A partir de los datos recopilados por Google Analytics de una e-commerce, se quiere predecir los ingresos por las transacciones hechas cada vez que un usuario entre en la página web.
# Remove duplicate rows
def drop_duplicates(df):
    """Drop duplicated rows from *df* in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to deduplicate; it is mutated in place.

    Returns
    -------
    pandas.DataFrame
        Every row involved in a duplication (all occurrences, not only the
        surplus copies), kept for inspection before they are collapsed.
    """
    print(f"{df.shape}")
    # keep=False flags *all* occurrences, so the returned frame shows each
    # duplicated row, not just the extra copies.
    df_duplicated = df[df.duplicated(keep=False)]
    # duplicated() with the default keep='first' flags exactly the rows that
    # drop_duplicates() removes; summing the mask avoids building a sub-frame.
    n_removed = df.duplicated().sum()
    print(f"Se han eliminado {n_removed} registros repetidos")
    df.drop_duplicates(inplace=True)
    print(f"{df.shape}")
    return df_duplicated
# Drop numeric variables with low variance
def varianza_nula(dataframe, std):
    """Remove, in place, every numeric column whose standard deviation
    falls below the ``std`` threshold, then return ``dataframe.head()``
    so the result can be eyeballed in the notebook.
    """
    # Transposed describe() puts one numeric column per row, with its 'std'.
    resumen_numerico = dataframe.describe(include=np.number).T
    # Columns below the chosen standard-deviation threshold (e.g. 0.15).
    columnas_baja_varianza = resumen_numerico.loc[resumen_numerico['std'] < std].index.tolist()
    dataframe.drop(columns=columnas_baja_varianza, inplace=True)
    print('Se han borrado las siguientes variables numéricas por tener baja varianza:\n', columnas_baja_varianza)
    return dataframe.head()
import sys
import os
# silence warnings
import warnings
warnings.filterwarnings("ignore")
import random
import json
# json_normalize moved to the top-level pandas namespace (pandas >= 1.0) and
# the old pandas.io.json location was removed in pandas 2.0 — keep a fallback
# so the notebook runs on both old and new versions.
try:
    from pandas import json_normalize  # normalizes JSON-formatted data
except ImportError:  # pandas < 0.25
    from pandas.io.json import json_normalize
import numpy as np  # Arrays
import pandas as pd  # Series and Dataframes
# For handling date variables
from datetime import datetime
# Modeling
from sklearn import model_selection
import xgboost as xgb
# Plotting
import matplotlib.pyplot as plt
plt.style.use('ggplot')  # pyplot style (ggplot comes from R --> Beautify Plots)
import seaborn as sns  # Advanced Plotting
from scipy import stats  # Statistics (e.g. to tell whether a distribution is normal)
# load files from Colab
#from google import files
#from google import drive
pd.options.display.max_rows = 100
# Guarantees the randomness is always the same
RANDOM_STATE = 42
#Si se usa Colab monto la carpeta en la que tengo los datos
'''
drive.mount('/content/gdrive')
'''
"\ndrive.mount('/content/gdrive')\n"
# Create the folders used throughout the notebook (idempotent: safe to re-run).
for folder in ['data', 'report', 'pickle', 'final']:
    # exist_ok=True replaces the try/except FileExistsError dance; `os` is
    # already imported at the top of the notebook.
    os.makedirs(folder, exist_ok=True)
Descargamos los datos y los metemos en la carpeta 'data': https://drive.google.com/file/d/1quBZ75bubrnaPD3jUBsuc7lWM23jnFcy/view?usp=drive_link
# Change into the project root. NOTE: os.chdir returns None, so the current
# path is read back with os.getcwd() below instead of capturing its result.
os.chdir(r'D:\Data_Science\MACHINE_LEARNING\ML_SUPERVISADO\ML_SUPERVISADO_REGRESION\Machine-Learning-Regression-GACR')
# Relative location of the CSV (leading separator: it is concatenated to
# file_dir as a plain string when loading the data).
file_name = r'\data\GACR.csv'
# Current working directory
file_dir = os.getcwd()
file_dir
'D:\\Data_Science\\MACHINE_LEARNING\\ML_SUPERVISADO\\ML_SUPERVISADO_REGRESION\\Machine-Learning-Regression-GACR'
file_dir+file_name
'D:\\Data_Science\\MACHINE_LEARNING\\ML_SUPERVISADO\\ML_SUPERVISADO_REGRESION\\Machine-Learning-Regression-GACR\\data\\GACR.csv'
pd.read_csv??
Signature: pd.read_csv( filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', sep=<no_default>, delimiter=None, header='infer', names=<no_default>, index_col=None, usecols=None, squeeze=None, prefix=<no_default>, mangle_dupe_cols=True, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=None, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression: 'CompressionOptions' = 'infer', thousands=None, decimal: 'str' = '.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors: 'str | None' = 'strict', dialect=None, error_bad_lines=None, warn_bad_lines=None, on_bad_lines=None, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, storage_options: 'StorageOptions' = None, ) Docstring: Read a comma-separated values (csv) file into DataFrame. Also supports optionally iterating or breaking of the file into chunks. Additional help can be found in the online docs for `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_. Parameters ---------- filepath_or_buffer : str, path object or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is expected. A local file could be: file://localhost/path/to/table.csv. If you want to pass in a path object, pandas accepts any ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. sep : str, default ',' Delimiter to use. 
If sep is None, the C engine cannot automatically detect the separator, but the Python parsing engine can, meaning the latter will be used and automatically detect the separator by Python's builtin sniffer tool, ``csv.Sniffer``. In addition, separators longer than 1 character and different from ``'\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. delimiter : str, default ``None`` Alias for sep. header : int, list of int, None, default 'infer' Row number(s) to use as the column names, and the start of the data. Default behavior is to infer the column names: if no names are passed the behavior is identical to ``header=0`` and column names are inferred from the first line of the file, if column names are passed explicitly then the behavior is identical to ``header=None``. Explicitly pass ``header=0`` to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not specified will be skipped (e.g. 2 in this example is skipped). Note that this parameter ignores commented lines and empty lines if ``skip_blank_lines=True``, so ``header=0`` denotes the first line of data rather than the first line of the file. names : array-like, optional List of column names to use. If the file contains a header row, then you should explicitly pass ``header=0`` to override the column names. Duplicates in this list are not allowed. index_col : int, str, sequence of int / str, or False, optional, default ``None`` Column(s) to use as the row labels of the ``DataFrame``, either given as string name or column index. If a sequence of int / str is given, a MultiIndex is used. Note: ``index_col=False`` can be used to force pandas to *not* use the first column as the index, e.g. 
when you have a malformed file with delimiters at the end of each line. usecols : list-like or callable, optional Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or inferred from the document header row(s). If ``names`` are given, the document header row(s) are not taken into account. For example, a valid list-like `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To instantiate a DataFrame from ``data`` with element order preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns in ``['foo', 'bar']`` order or ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` for ``['bar', 'foo']`` order. If callable, the callable function will be evaluated against the column names, returning names where the callable function evaluates to True. An example of a valid callable argument would be ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster parsing time and lower memory usage. squeeze : bool, default False If the parsed data only contains one column then return a Series. .. deprecated:: 1.4.0 Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze the data. prefix : str, optional Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... .. deprecated:: 1.4.0 Use a list comprehension on the DataFrame's columns after calling ``read_csv``. mangle_dupe_cols : bool, default True Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. dtype : Type name or dict of column -> type, optional Data type for data or columns. E.g. 
{'a': np.float64, 'b': np.int32, 'c': 'Int64'} Use `str` or `object` together with suitable `na_values` settings to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. engine : {'c', 'python', 'pyarrow'}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. Multithreading is currently only supported by the pyarrow engine. .. versionadded:: 1.4.0 The "pyarrow" engine was added as an *experimental* engine, and some features are unsupported, or may not work correctly, with this engine. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. true_values : list, optional Values to consider as True. false_values : list, optional Values to consider as False. skipinitialspace : bool, default False Skip spaces after delimiter. skiprows : list-like, int or callable, optional Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file. If callable, the callable function will be evaluated against the row indices, returning True if the row should be skipped and False otherwise. An example of a valid callable argument would be ``lambda x: x in [0, 2]``. skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with engine='c'). nrows : int, optional Number of rows of file to read. Useful for reading pieces of large files. na_values : scalar, str, list-like, or dict, optional Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null'. keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. 
Depending on whether `na_values` is passed in, the behavior is as follows: * If `keep_default_na` is True, and `na_values` are specified, `na_values` is appended to the default NaN values used for parsing. * If `keep_default_na` is True, and `na_values` are not specified, only the default NaN values are used for parsing. * If `keep_default_na` is False, and `na_values` are specified, only the NaN values specified `na_values` are used for parsing. * If `keep_default_na` is False, and `na_values` are not specified, no strings will be parsed as NaN. Note that if `na_filter` is passed in as False, the `keep_default_na` and `na_values` parameters will be ignored. na_filter : bool, default True Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing na_filter=False can improve the performance of reading a large file. verbose : bool, default False Indicate number of NA values placed in non-numeric columns. skip_blank_lines : bool, default True If True, skip over blank lines rather than interpreting as NaN values. parse_dates : bool or list of int or names or list of lists or dict, default False The behavior is as follows: * boolean. If True -> try parsing the index. * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column. * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column. * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' If a column or index cannot be represented as an array of datetimes, say because of an unparsable value or a mixture of timezones, the column or index will be returned unaltered as an object data type. For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv``. To parse an index or column with a mixture of timezones, specify ``date_parser`` to be a partially-applied :func:`pandas.to_datetime` with ``utc=True``. See :ref:`io.csv.mixed_timezones` for more. 
Note: A fast-path exists for iso8601-formatted dates. infer_datetime_format : bool, default False If True and `parse_dates` is enabled, pandas will attempt to infer the format of the datetime strings in the columns, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by 5-10x. keep_date_col : bool, default False If True and `parse_dates` specifies combining multiple columns then keep the original columns. date_parser : function, optional Function to use for converting a sequence of string columns to an array of datetime instances. The default uses ``dateutil.parser.parser`` to do the conversion. Pandas will try to call `date_parser` in three different ways, advancing to the next if an exception occurs: 1) Pass one or more arrays (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the string values from the columns defined by `parse_dates` into a single array and pass that; and 3) call `date_parser` once for each row using one or more strings (corresponding to the columns defined by `parse_dates`) as arguments. dayfirst : bool, default False DD/MM format dates, international and European format. cache_dates : bool, default True If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. .. versionadded:: 0.25.0 iterator : bool, default False Return TextFileReader object for iteration or getting chunks with ``get_chunk()``. .. versionchanged:: 1.2 ``TextFileReader`` is a context manager. chunksize : int, optional Return TextFileReader object for iteration. See the `IO Tools docs <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ for more information on ``iterator`` and ``chunksize``. .. versionchanged:: 1.2 ``TextFileReader`` is a context manager. compression : str or dict, default 'infer' For on-the-fly decompression of on-disk data. 
If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using 'zip', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` set to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. .. versionchanged:: 1.4.0 Zstandard support. thousands : str, optional Thousands separator. decimal : str, default '.' Character to recognize as decimal point (e.g. use ',' for European data). lineterminator : str (length 1), optional Character to break file into lines. Only valid with C parser. quotechar : str (length 1), optional The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored. quoting : int or csv.QUOTE_* instance, default 0 Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). doublequote : bool, default ``True`` When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate whether or not to interpret two consecutive quotechar elements INSIDE a field as a single ``quotechar`` element. escapechar : str (length 1), optional One-character string used to escape other characters. comment : str, optional Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter must be a single character. 
Like empty lines (as long as ``skip_blank_lines=True``), fully commented lines are ignored by the parameter `header` but not by `skiprows`. For example, if ``comment='#'``, parsing ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being treated as the header. encoding : str, optional Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python standard encodings <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ . .. versionchanged:: 1.2 When ``encoding`` is ``None``, ``errors="replace"`` is passed to ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``. This behavior was previously only the case for ``engine="python"``. .. versionchanged:: 1.3.0 ``encoding_errors`` is a new argument. ``encoding`` has no longer an influence on how encoding errors are handled. encoding_errors : str, optional, default "strict" How encoding errors are treated. `List of possible values <https://docs.python.org/3/library/codecs.html#error-handlers>`_ . .. versionadded:: 1.3.0 dialect : str or csv.Dialect, optional If provided, this parameter will override values (default or not) for the following parameters: `delimiter`, `doublequote`, `escapechar`, `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to override values, a ParserWarning will be issued. See csv.Dialect documentation for more details. error_bad_lines : bool, optional, default ``None`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these "bad lines" will be dropped from the DataFrame that is returned. .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. warn_bad_lines : bool, optional, default ``None`` If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. .. 
deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : - 'error', raise an Exception when a bad line is encountered. - 'warn', raise a warning when a bad line is encountered and skip that line. - 'skip', skip bad lines without raising or warning when they are encountered. .. versionadded:: 1.3.0 .. versionadded:: 1.4.0 - callable, function with signature ``(bad_line: list[str]) -> list[str] | None`` that will process a single bad line. ``bad_line`` is a list of strings split by the ``sep``. If the function returns ``None``, the bad line will be ignored. If the function returns a new list of strings with more elements than expected, a ``ParserWarning`` will be emitted while dropping extra elements. Only supported when ``engine="python"`` delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. ``' '`` or ``' '``) will be used as the sep. Equivalent to setting ``sep='\s+'``. If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. To ensure no mixed types either set False, or specify the type with the `dtype` parameter. Note that the entire file is read into a single DataFrame regardless, use the `chunksize` or `iterator` parameter to return the data in chunks. (Only valid with C parser). memory_map : bool, default False If a filepath is provided for `filepath_or_buffer`, map the file object directly onto memory and access the data directly from there. Using this option can improve performance because there is no longer any I/O overhead. 
float_precision : str, optional Specifies which converter the C engine should use for floating-point values. The options are ``None`` or 'high' for the ordinary converter, 'legacy' for the original lower precision pandas converter, and 'round_trip' for the round-trip converter. .. versionchanged:: 1.2 storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value pairs are forwarded to ``urllib`` as header options. For other URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are forwarded to ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details. .. versionadded:: 1.2 Returns ------- DataFrame or TextParser A comma-separated values (csv) file is returned as two-dimensional data structure with labeled axes. See Also -------- DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. read_csv : Read a comma-separated values (csv) file into DataFrame. read_fwf : Read a table of fixed-width formatted lines into DataFrame. 
Examples -------- >>> pd.read_csv('data.csv') # doctest: +SKIP Source: @deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"]) @Appender( _doc_read_csv_and_table.format( func_name="read_csv", summary="Read a comma-separated values (csv) file into DataFrame.", _default_sep="','", storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"], ) ) def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], sep=lib.no_default, delimiter=None, # Column and Index Locations and Names header="infer", names=lib.no_default, index_col=None, usecols=None, squeeze=None, prefix=lib.no_default, mangle_dupe_cols=True, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, # NA and Missing Data Handling na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, # Datetime Handling parse_dates=None, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, cache_dates=True, # Iteration iterator=False, chunksize=None, # Quoting, Compression, and File Format compression: CompressionOptions = "infer", thousands=None, decimal: str = ".", lineterminator=None, quotechar='"', quoting=csv.QUOTE_MINIMAL, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors: str | None = "strict", dialect=None, # Error Handling error_bad_lines=None, warn_bad_lines=None, # TODO(2.0): set on_bad_lines to "error". # See _refine_defaults_read comment for why we do this. 
on_bad_lines=None, # Internal delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, storage_options: StorageOptions = None, ): # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] del kwds["sep"] kwds_defaults = _refine_defaults_read( dialect, delimiter, delim_whitespace, engine, sep, error_bad_lines, warn_bad_lines, on_bad_lines, names, prefix, defaults={"delimiter": ","}, ) kwds.update(kwds_defaults) return _read(filepath_or_buffer, kwds) File: c:\users\jagui\anaconda3\lib\site-packages\pandas\io\parsers\readers.py Type: function
Como tengo un dataset muy grande, hago una partición pequeña para poder trabajar de forma rápida con ella, y finalmente, cuando tenga definidos todos los procesos, los aplico al dataset entero
## Reader setup for the columns stored in JSON format
columns = ['device', 'geoNetwork', 'totals'] # Columns whose cells hold JSON data
random.seed(10) # Ensures the same random partition is loaded on every run (same role as a model's random_state)
p = 0.1 # Sample 10% of the dataset rows
def json_read(df):
    """Read a random sample of the CSV at ``file_dir + df`` and flatten its
    JSON-formatted columns.

    Relies on the notebook-level globals ``file_dir`` (project root path),
    ``columns`` (names of the JSON columns) and ``p`` (sampling fraction).

    Parameters
    ----------
    df : str
        File name (relative path, leading separator included) appended to
        ``file_dir`` to locate the CSV. Kept as ``df`` for backward
        compatibility — callers pass it by keyword.

    Returns
    -------
    pandas.DataFrame
        Sampled dataframe where each JSON column has been expanded into
        ``column.subcolumn`` columns.
    """
    csv_path = file_dir + df
    # Read roughly a fraction "p" of the rows (the header row is always kept).
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in columns},  # parse the JSON cells
                     dtype={'fullVisitorId': 'str'},  # keep as string so leading zeros survive
                     skiprows=lambda i: i > 0 and random.random() > p)
    # Flatten each JSON column: one cell with several values becomes several
    # "group.subgroup" columns merged back on the row index.
    for column in columns:
        # pd.json_normalize replaces the removed pandas.io.json.json_normalize
        column_as_df = pd.json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df
Cargamos el dataset usando el comando mágico de celda %%time para medir el tiempo de carga
%%time
df = json_read(df = file_name)
Loaded GACR.csv. Shape: (90306, 42) Wall time: 37.2 s
Comprobamos que no haya ningún registro duplicado
drop_duplicates(df)
(90306, 42) Se han eliminado 0 registros repetidos (90306, 42)
| Unnamed: 0 | channelGrouping | date | fullVisitorId | sessionId | socialEngagementType | visitId | visitNumber | visitStartTime | device.browser | ... | geoNetwork.networkDomain | geoNetwork.latitude | geoNetwork.longitude | geoNetwork.networkLocation | totals.visits | totals.hits | totals.pageviews | totals.bounces | totals.newVisits | totals.transactionRevenue |
|---|
0 rows × 42 columns
A.- Data Size
# Tamaño del dataset
df.size
3792852
# Número de filas y columnas
df.shape
(90306, 42)
# Información resumida del dataset
df.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Int64Index: 90306 entries, 0 to 90305 Columns: 42 entries, Unnamed: 0 to totals.transactionRevenue dtypes: bool(1), int64(5), object(36) memory usage: 29.0+ MB
Tenemos 36 variables objeto que tendremos que pasar a numéricas. El deadline del proyecto estará en torno a 2-3 semanas
B.- Visualización directa de los datos
# Columnas del dataset
df.columns
Index(['Unnamed: 0', 'channelGrouping', 'date', 'fullVisitorId', 'sessionId',
'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime',
'device.browser', 'device.browserVersion', 'device.browserSize',
'device.operatingSystem', 'device.operatingSystemVersion',
'device.isMobile', 'device.mobileDeviceBranding',
'device.mobileDeviceModel', 'device.mobileInputSelector',
'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName',
'device.flashVersion', 'device.language', 'device.screenColors',
'device.screenResolution', 'device.deviceCategory',
'geoNetwork.continent', 'geoNetwork.subContinent', 'geoNetwork.country',
'geoNetwork.region', 'geoNetwork.metro', 'geoNetwork.city',
'geoNetwork.cityId', 'geoNetwork.networkDomain', 'geoNetwork.latitude',
'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits',
'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits',
'totals.transactionRevenue'],
dtype='object')
# 5 primeros registros
df.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| Unnamed: 0 | 13 | 27 | 28 | 37 | 43 |
| channelGrouping | Organic Search | Organic Search | Referral | Organic Search | Organic Search |
| date | 20160902 | 20160902 | 20160902 | 20160902 | 20160902 |
| fullVisitorId | 1438082600262726746 | 1283542838194038522 | 4339756682310369249 | 062441254657008214 | 1381975521299261523 |
| sessionId | 1438082600262726746_1472803483 | 1283542838194038522_1472885255 | 4339756682310369249_1472828340 | 062441254657008214_1472875520 | 1381975521299261523_1472829727 |
| socialEngagementType | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged |
| visitId | 1472803483 | 1472885255 | 1472828340 | 1472875520 | 1472829727 |
| visitNumber | 1 | 1 | 1 | 1 | 1 |
| visitStartTime | 1472803483 | 1472885255 | 1472828340 | 1472875520 | 1472829727 |
| device.browser | Safari | Safari | Chrome | Chrome | Chrome |
| device.browserVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.browserSize | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.operatingSystem | iOS | Macintosh | Android | Windows | Macintosh |
| device.operatingSystemVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.isMobile | True | False | True | False | False |
| device.mobileDeviceBranding | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceModel | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileInputSelector | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceInfo | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceMarketingName | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.flashVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.language | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.screenColors | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.screenResolution | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.deviceCategory | mobile | desktop | mobile | desktop | desktop |
| geoNetwork.continent | Asia | Europe | Asia | Oceania | Asia |
| geoNetwork.subContinent | Southern Asia | Eastern Europe | Southern Asia | Australasia | Eastern Asia |
| geoNetwork.country | Pakistan | Hungary | India | Australia | South Korea |
| geoNetwork.region | Sindh | not available in demo dataset | Karnataka | not available in demo dataset | Seoul |
| geoNetwork.metro | (not set) | not available in demo dataset | (not set) | not available in demo dataset | (not set) |
| geoNetwork.city | Karachi | not available in demo dataset | Bengaluru | not available in demo dataset | Seoul |
| geoNetwork.cityId | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.networkDomain | unknown.unknown | broadband.hu | unknown.unknown | uwa.edu.au | unknown.unknown |
| geoNetwork.latitude | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.longitude | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.networkLocation | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| totals.visits | 1 | 1 | 1 | 1 | 1 |
| totals.hits | 1 | 1 | 1 | 1 | 1 |
| totals.pageviews | 1 | 1 | 1 | 1 | 1 |
| totals.bounces | 1 | 1 | 1 | 1 | 1 |
| totals.newVisits | 1 | 1 | 1 | 1 | 1 |
| totals.transactionRevenue | NaN | NaN | NaN | NaN | NaN |
# Inspect 5 randomly chosen rows (transposed for readability)
df.sample(n=5).T
| 18782 | 1328 | 52218 | 88031 | 47217 | |
|---|---|---|---|---|---|
| Unnamed: 0 | 189961 | 13239 | 522357 | 881452 | 473156 |
| channelGrouping | Social | Social | Organic Search | Organic Search | Organic Search |
| date | 20160901 | 20160811 | 20170603 | 20161019 | 20170222 |
| fullVisitorId | 6123965036917167631 | 4895187008328298603 | 4076463815754499010 | 816471424120903829 | 5310502070074179206 |
| sessionId | 6123965036917167631_1472724028 | 4895187008328298603_1470977163 | 4076463815754499010_1496487499 | 0816471424120903829_1476934293 | 5310502070074179206_1487756304 |
| socialEngagementType | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged |
| visitId | 1472724028 | 1470977163 | 1496487499 | 1476934293 | 1487756304 |
| visitNumber | 1 | 1 | 1 | 1 | 1 |
| visitStartTime | 1472724028 | 1470977163 | 1496487499 | 1476934293 | 1487756304 |
| device.browser | Safari | Chrome | Chrome | Chrome | Chrome |
| device.browserVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.browserSize | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.operatingSystem | Macintosh | Windows | Android | Android | Windows |
| device.operatingSystemVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.isMobile | False | False | True | True | False |
| device.mobileDeviceBranding | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceModel | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileInputSelector | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceInfo | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceMarketingName | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.flashVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.language | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.screenColors | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.screenResolution | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.deviceCategory | desktop | desktop | tablet | mobile | desktop |
| geoNetwork.continent | Asia | Oceania | Europe | Asia | Europe |
| geoNetwork.subContinent | Southern Asia | Australasia | Western Europe | Southern Asia | Western Europe |
| geoNetwork.country | India | Australia | France | India | Netherlands |
| geoNetwork.region | not available in demo dataset | New South Wales | not available in demo dataset | Telangana | not available in demo dataset |
| geoNetwork.metro | not available in demo dataset | (not set) | not available in demo dataset | (not set) | not available in demo dataset |
| geoNetwork.city | not available in demo dataset | Sydney | not available in demo dataset | Hyderabad | not available in demo dataset |
| geoNetwork.cityId | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.networkDomain | unknown.unknown | aapt.com.au | (not set) | (not set) | ziggozakelijk.nl |
| geoNetwork.latitude | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.longitude | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.networkLocation | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| totals.visits | 1 | 1 | 1 | 1 | 1 |
| totals.hits | 1 | 1 | 2 | 2 | 1 |
| totals.pageviews | 1 | 1 | 2 | 2 | 1 |
| totals.bounces | 1 | 1 | NaN | NaN | 1 |
| totals.newVisits | 1 | 1 | 1 | 1 | 1 |
| totals.transactionRevenue | NaN | NaN | NaN | NaN | NaN |
# Inspect the last 5 rows (transposed for readability)
df.tail(5).T
| 90301 | 90302 | 90303 | 90304 | 90305 | |
|---|---|---|---|---|---|
| Unnamed: 0 | 903617 | 903623 | 903642 | 903647 | 903648 |
| channelGrouping | Social | Social | Social | Social | Social |
| date | 20170104 | 20170104 | 20170104 | 20170104 | 20170104 |
| fullVisitorId | 8316382343226738015 | 6636384798982309878 | 469840327005431380 | 2140149974339316233 | 5123779100307500332 |
| sessionId | 8316382343226738015_1483549157 | 6636384798982309878_1483525429 | 469840327005431380_1483573235 | 2140149974339316233_1483557808 | 5123779100307500332_1483554750 |
| socialEngagementType | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged | Not Socially Engaged |
| visitId | 1483549157 | 1483525429 | 1483573235 | 1483557808 | 1483554750 |
| visitNumber | 1 | 1 | 1 | 1 | 1 |
| visitStartTime | 1483549157 | 1483525429 | 1483573235 | 1483557808 | 1483554750 |
| device.browser | Chrome | Chrome | Internet Explorer | Chrome | Chrome |
| device.browserVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.browserSize | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.operatingSystem | Macintosh | Windows | Windows | Windows | Windows |
| device.operatingSystemVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.isMobile | False | False | False | False | False |
| device.mobileDeviceBranding | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceModel | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileInputSelector | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceInfo | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.mobileDeviceMarketingName | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.flashVersion | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.language | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.screenColors | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.screenResolution | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| device.deviceCategory | desktop | desktop | desktop | desktop | desktop |
| geoNetwork.continent | Europe | Asia | Europe | Africa | Americas |
| geoNetwork.subContinent | Western Europe | Western Asia | Western Europe | Northern Africa | Caribbean |
| geoNetwork.country | France | Turkey | Germany | Egypt | Puerto Rico |
| geoNetwork.region | Auvergne-Rhone-Alpes | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.metro | (not set) | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.city | Lyon | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.cityId | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.networkDomain | completel.net | unknown.unknown | (not set) | tedata.net | prtc.net |
| geoNetwork.latitude | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.longitude | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| geoNetwork.networkLocation | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| totals.visits | 1 | 1 | 1 | 1 | 1 |
| totals.hits | 3 | 4 | 9 | 16 | 17 |
| totals.pageviews | 3 | 4 | 9 | 11 | 15 |
| totals.bounces | NaN | NaN | NaN | NaN | NaN |
| totals.newVisits | 1 | 1 | 1 | 1 | 1 |
| totals.transactionRevenue | NaN | NaN | NaN | NaN | NaN |
Cada fila del dataset representa una sesión, y una misma persona puede tener varias sesiones. El identificador único de cada fila es el sessionId; al ser único por fila, esta variable no me va a ayudar a nivel de predicción.
Lo vamos a mantener por si lo necesitamos para hacer joins o cualquier otra operación que nos sirva de nexo de unión.
Información del dataset
En Google Analytics, en las adquisiciones tenemos la fuente (google, bing, yahoo, ...) y el medio mediante el cual el usuario accede a mi web (trafico de pago o CPC, tráfico orgánico o no forzado, tráfico directo, tráfico referido o que accede a mi web desde otros sitios que no son búsquedas, .... )
# Use sessionId (unique per row) as the DataFrame index
df = df.set_index('sessionId')
df.head(1)
| Unnamed: 0 | channelGrouping | date | fullVisitorId | socialEngagementType | visitId | visitNumber | visitStartTime | device.browser | device.browserVersion | ... | geoNetwork.networkDomain | geoNetwork.latitude | geoNetwork.longitude | geoNetwork.networkLocation | totals.visits | totals.hits | totals.pageviews | totals.bounces | totals.newVisits | totals.transactionRevenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sessionId | |||||||||||||||||||||
| 1438082600262726746_1472803483 | 13 | Organic Search | 20160902 | 1438082600262726746 | Not Socially Engaged | 1472803483 | 1 | 1472803483 | Safari | not available in demo dataset | ... | unknown.unknown | not available in demo dataset | not available in demo dataset | not available in demo dataset | 1 | 1 | 1 | 1 | 1 | NaN |
1 rows × 41 columns
# Drop the other id columns: sessionId already embeds both of them
df_ids = ['fullVisitorId', 'visitId']
df = df.drop(columns=df_ids)
df.head(1)
| Unnamed: 0 | channelGrouping | date | socialEngagementType | visitNumber | visitStartTime | device.browser | device.browserVersion | device.browserSize | device.operatingSystem | ... | geoNetwork.networkDomain | geoNetwork.latitude | geoNetwork.longitude | geoNetwork.networkLocation | totals.visits | totals.hits | totals.pageviews | totals.bounces | totals.newVisits | totals.transactionRevenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sessionId | |||||||||||||||||||||
| 1438082600262726746_1472803483 | 13 | Organic Search | 20160902 | Not Socially Engaged | 1 | 1472803483 | Safari | not available in demo dataset | not available in demo dataset | iOS | ... | unknown.unknown | not available in demo dataset | not available in demo dataset | not available in demo dataset | 1 | 1 | 1 | 1 | 1 | NaN |
1 rows × 39 columns
C.- Tipo de atributos disponibles
Vemos si los atributos de nuestro dataframe son numéricos o categóricos. Se clasifican, de menor a mayor espacio que ocupan, en:
Numéricas (continuas)
Categóricas (discretas)
# Full column overview: dtype and non-null count per column
df.info(verbose = True)
<class 'pandas.core.frame.DataFrame'> Index: 90306 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 90306 non-null int64 1 channelGrouping 90306 non-null object 2 date 90306 non-null int64 3 socialEngagementType 90306 non-null object 4 visitNumber 90306 non-null int64 5 visitStartTime 90306 non-null int64 6 device.browser 90306 non-null object 7 device.browserVersion 90306 non-null object 8 device.browserSize 90306 non-null object 9 device.operatingSystem 90306 non-null object 10 device.operatingSystemVersion 90306 non-null object 11 device.isMobile 90306 non-null bool 12 device.mobileDeviceBranding 90306 non-null object 13 device.mobileDeviceModel 90306 non-null object 14 device.mobileInputSelector 90306 non-null object 15 device.mobileDeviceInfo 90306 non-null object 16 device.mobileDeviceMarketingName 90306 non-null object 17 device.flashVersion 90306 non-null object 18 device.language 90306 non-null object 19 device.screenColors 90306 non-null object 20 device.screenResolution 90306 non-null object 21 device.deviceCategory 90306 non-null object 22 geoNetwork.continent 90306 non-null object 23 geoNetwork.subContinent 90306 non-null object 24 geoNetwork.country 90306 non-null object 25 geoNetwork.region 90306 non-null object 26 geoNetwork.metro 90306 non-null object 27 geoNetwork.city 90306 non-null object 28 geoNetwork.cityId 90306 non-null object 29 geoNetwork.networkDomain 90306 non-null object 30 geoNetwork.latitude 90306 non-null object 31 geoNetwork.longitude 90306 non-null object 32 geoNetwork.networkLocation 90306 non-null object 33 totals.visits 90306 non-null object 34 totals.hits 90306 non-null object 35 totals.pageviews 90296 non-null object 36 totals.bounces 44973 non-null object 37 totals.newVisits 70392 non-null object 38 totals.transactionRevenue 1110 non-null object dtypes: bool(1), int64(4), object(34) memory 
usage: 27.0+ MB
D.- Estadísticos descriptivos básicos y distribución de los nulos
Mediante el método describe(), podemos obtener los estadísticos representativos de cada uno de los atributos del DataFrame.
# Summary statistics of the numeric attributes
df.describe(include=np.number).T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 90306.0 | 4.531874e+05 | 2.605409e+05 | 1.300000e+01 | 2.280235e+05 | 4.528250e+05 | 6.791640e+05 | 9.036480e+05 |
| date | 90306.0 | 2.016589e+07 | 4.697568e+03 | 2.016080e+07 | 2.016103e+07 | 2.017011e+07 | 2.017042e+07 | 2.017080e+07 |
| visitNumber | 90306.0 | 2.254269e+00 | 9.102378e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.730000e+02 |
| visitStartTime | 90306.0 | 1.485020e+09 | 9.027413e+06 | 1.470035e+09 | 1.477570e+09 | 1.483985e+09 | 1.492787e+09 | 1.501656e+09 |
# Summary statistics of the categorical (non-numeric) attributes:
# count / unique / top / freq
df.describe(exclude=np.number).T
| count | unique | top | freq | |
|---|---|---|---|---|
| channelGrouping | 90306 | 8 | Organic Search | 38445 |
| socialEngagementType | 90306 | 1 | Not Socially Engaged | 90306 |
| device.browser | 90306 | 28 | Chrome | 62147 |
| device.browserVersion | 90306 | 1 | not available in demo dataset | 90306 |
| device.browserSize | 90306 | 1 | not available in demo dataset | 90306 |
| device.operatingSystem | 90306 | 17 | Windows | 35174 |
| device.operatingSystemVersion | 90306 | 1 | not available in demo dataset | 90306 |
| device.isMobile | 90306 | 2 | False | 66571 |
| device.mobileDeviceBranding | 90306 | 1 | not available in demo dataset | 90306 |
| device.mobileDeviceModel | 90306 | 1 | not available in demo dataset | 90306 |
| device.mobileInputSelector | 90306 | 1 | not available in demo dataset | 90306 |
| device.mobileDeviceInfo | 90306 | 1 | not available in demo dataset | 90306 |
| device.mobileDeviceMarketingName | 90306 | 1 | not available in demo dataset | 90306 |
| device.flashVersion | 90306 | 1 | not available in demo dataset | 90306 |
| device.language | 90306 | 1 | not available in demo dataset | 90306 |
| device.screenColors | 90306 | 1 | not available in demo dataset | 90306 |
| device.screenResolution | 90306 | 1 | not available in demo dataset | 90306 |
| device.deviceCategory | 90306 | 3 | desktop | 66572 |
| geoNetwork.continent | 90306 | 6 | Americas | 44890 |
| geoNetwork.subContinent | 90306 | 23 | Northern America | 38911 |
| geoNetwork.country | 90306 | 189 | United States | 36335 |
| geoNetwork.region | 90306 | 332 | not available in demo dataset | 50639 |
| geoNetwork.metro | 90306 | 86 | not available in demo dataset | 50639 |
| geoNetwork.city | 90306 | 542 | not available in demo dataset | 50639 |
| geoNetwork.cityId | 90306 | 1 | not available in demo dataset | 90306 |
| geoNetwork.networkDomain | 90306 | 7394 | (not set) | 24320 |
| geoNetwork.latitude | 90306 | 1 | not available in demo dataset | 90306 |
| geoNetwork.longitude | 90306 | 1 | not available in demo dataset | 90306 |
| geoNetwork.networkLocation | 90306 | 1 | not available in demo dataset | 90306 |
| totals.visits | 90306 | 1 | 1 | 90306 |
| totals.hits | 90306 | 161 | 1 | 44587 |
| totals.pageviews | 90296 | 125 | 1 | 45150 |
| totals.bounces | 44973 | 1 | 1 | 44973 |
| totals.newVisits | 70392 | 1 | 1 | 70392 |
| totals.transactionRevenue | 1110 | 820 | 33590000 | 25 |
Variables de baja varianza
Son variables que van a ser irrelevantes para el modelo (no le van a aportar información relevante).
Se eliminan aquellas variables numéricas cuya varianza sea inferior a 0.15
# Drop numeric variables whose standard deviation is below 0.15
varianza_nula(dataframe=df, std=0.15)
Se han borrado las siguientes variables numéricas por tener baja varianza: []
| Unnamed: 0 | channelGrouping | date | socialEngagementType | visitNumber | visitStartTime | device.browser | device.browserVersion | device.browserSize | device.operatingSystem | ... | geoNetwork.networkDomain | geoNetwork.latitude | geoNetwork.longitude | geoNetwork.networkLocation | totals.visits | totals.hits | totals.pageviews | totals.bounces | totals.newVisits | totals.transactionRevenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sessionId | |||||||||||||||||||||
| 1438082600262726746_1472803483 | 13 | Organic Search | 20160902 | Not Socially Engaged | 1 | 1472803483 | Safari | not available in demo dataset | not available in demo dataset | iOS | ... | unknown.unknown | not available in demo dataset | not available in demo dataset | not available in demo dataset | 1 | 1 | 1 | 1 | 1 | NaN |
| 1283542838194038522_1472885255 | 27 | Organic Search | 20160902 | Not Socially Engaged | 1 | 1472885255 | Safari | not available in demo dataset | not available in demo dataset | Macintosh | ... | broadband.hu | not available in demo dataset | not available in demo dataset | not available in demo dataset | 1 | 1 | 1 | 1 | 1 | NaN |
| 4339756682310369249_1472828340 | 28 | Referral | 20160902 | Not Socially Engaged | 1 | 1472828340 | Chrome | not available in demo dataset | not available in demo dataset | Android | ... | unknown.unknown | not available in demo dataset | not available in demo dataset | not available in demo dataset | 1 | 1 | 1 | 1 | 1 | NaN |
| 062441254657008214_1472875520 | 37 | Organic Search | 20160902 | Not Socially Engaged | 1 | 1472875520 | Chrome | not available in demo dataset | not available in demo dataset | Windows | ... | uwa.edu.au | not available in demo dataset | not available in demo dataset | not available in demo dataset | 1 | 1 | 1 | 1 | 1 | NaN |
| 1381975521299261523_1472829727 | 43 | Organic Search | 20160902 | Not Socially Engaged | 1 | 1472829727 | Chrome | not available in demo dataset | not available in demo dataset | Macintosh | ... | unknown.unknown | not available in demo dataset | not available in demo dataset | not available in demo dataset | 1 | 1 | 1 | 1 | 1 | NaN |
5 rows × 39 columns
En este dataset, encontramos 17 variables categóricas sin varianza (aquellas que no tienen nulos y tienen una sola etiqueta), es decir, con unique=1. La información que estos atributos aportan al modelo es irrelevante, así que las eliminamos.
# List all remaining column names
df.columns
Index(['Unnamed: 0', 'channelGrouping', 'date', 'socialEngagementType',
'visitNumber', 'visitStartTime', 'device.browser',
'device.browserVersion', 'device.browserSize', 'device.operatingSystem',
'device.operatingSystemVersion', 'device.isMobile',
'device.mobileDeviceBranding', 'device.mobileDeviceModel',
'device.mobileInputSelector', 'device.mobileDeviceInfo',
'device.mobileDeviceMarketingName', 'device.flashVersion',
'device.language', 'device.screenColors', 'device.screenResolution',
'device.deviceCategory', 'geoNetwork.continent',
'geoNetwork.subContinent', 'geoNetwork.country', 'geoNetwork.region',
'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.cityId',
'geoNetwork.networkDomain', 'geoNetwork.latitude',
'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits',
'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits',
'totals.transactionRevenue'],
dtype='object')
# Build the list of columns that hold a single label, using a for loop
constant_columns = []
for column in df.columns:
    if df[column].nunique(dropna = False) == 1:  # dropna=False: columns with NaN plus one label count as 2 (they will be turned into booleans later)
        constant_columns.append(column)
constant_columns
['socialEngagementType', 'device.browserVersion', 'device.browserSize', 'device.operatingSystemVersion', 'device.mobileDeviceBranding', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.flashVersion', 'device.language', 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits']
Cuando se hace un for loop y como resultado tengamos una lista, siempre podemos hacer uso de una list comprehension con una sola línea de código
# Same result as the loop above, written as a one-line list comprehension
constant_columns = [column for column in df.columns if df[column].nunique(dropna = False) == 1]
constant_columns
['socialEngagementType', 'device.browserVersion', 'device.browserSize', 'device.operatingSystemVersion', 'device.mobileDeviceBranding', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.flashVersion', 'device.language', 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits']
# Remove the constant (zero-information) columns found above
df = df.drop(columns=constant_columns)
df.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 90306 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Columns: 21 entries, Unnamed: 0 to totals.transactionRevenue dtypes: bool(1), int64(4), object(16) memory usage: 14.6+ MB
# Quick look at the first two rows after dropping the constant columns
df.head(2).T
| sessionId | 1438082600262726746_1472803483 | 1283542838194038522_1472885255 |
|---|---|---|
| Unnamed: 0 | 13 | 27 |
| channelGrouping | Organic Search | Organic Search |
| date | 20160902 | 20160902 |
| visitNumber | 1 | 1 |
| visitStartTime | 1472803483 | 1472885255 |
| device.browser | Safari | Safari |
| device.operatingSystem | iOS | Macintosh |
| device.isMobile | True | False |
| device.deviceCategory | mobile | desktop |
| geoNetwork.continent | Asia | Europe |
| geoNetwork.subContinent | Southern Asia | Eastern Europe |
| geoNetwork.country | Pakistan | Hungary |
| geoNetwork.region | Sindh | not available in demo dataset |
| geoNetwork.metro | (not set) | not available in demo dataset |
| geoNetwork.city | Karachi | not available in demo dataset |
| geoNetwork.networkDomain | unknown.unknown | broadband.hu |
| totals.hits | 1 | 1 |
| totals.pageviews | 1 | 1 |
| totals.bounces | 1 | 1 |
| totals.newVisits | 1 | 1 |
| totals.transactionRevenue | NaN | NaN |
# Combined describe() for both numeric and categorical columns
df_description = df.describe(include='all').T
df_description
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 90306.0 | NaN | NaN | NaN | 453187.353044 | 260540.883169 | 13.0 | 228023.5 | 452825.0 | 679164.0 | 903648.0 |
| channelGrouping | 90306 | 8 | Organic Search | 38445 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| date | 90306.0 | NaN | NaN | NaN | 20165894.915122 | 4697.567567 | 20160801.0 | 20161027.0 | 20170109.0 | 20170421.0 | 20170801.0 |
| visitNumber | 90306.0 | NaN | NaN | NaN | 2.254269 | 9.102378 | 1.0 | 1.0 | 1.0 | 1.0 | 373.0 |
| visitStartTime | 90306.0 | NaN | NaN | NaN | 1485020394.767092 | 9027412.838579 | 1470035170.0 | 1477570329.25 | 1483984659.0 | 1492787453.0 | 1501655878.0 |
| device.browser | 90306 | 28 | Chrome | 62147 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| device.operatingSystem | 90306 | 17 | Windows | 35174 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| device.isMobile | 90306 | 2 | False | 66571 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| device.deviceCategory | 90306 | 3 | desktop | 66572 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| geoNetwork.continent | 90306 | 6 | Americas | 44890 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| geoNetwork.subContinent | 90306 | 23 | Northern America | 38911 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| geoNetwork.country | 90306 | 189 | United States | 36335 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| geoNetwork.region | 90306 | 332 | not available in demo dataset | 50639 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| geoNetwork.metro | 90306 | 86 | not available in demo dataset | 50639 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| geoNetwork.city | 90306 | 542 | not available in demo dataset | 50639 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| geoNetwork.networkDomain | 90306 | 7394 | (not set) | 24320 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| totals.hits | 90306 | 161 | 1 | 44587 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| totals.pageviews | 90296 | 125 | 1 | 45150 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| totals.bounces | 44973 | 1 | 1 | 44973 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| totals.newVisits | 70392 | 1 | 1 | 70392 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| totals.transactionRevenue | 1110 | 820 | 33590000 | 25 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Persist the describe() summary as CSV so it can be analysed elsewhere.
# Create the target folder first: to_csv raises OSError if it is missing.
os.makedirs('report', exist_ok=True)
df_description.to_csv('report/df_description.csv')
# When running in Colab, download the file locally
# files.download('report/df_description.csv')
E.- Distribución de los atributos
Análisis del Target
En un problema de aprendizaje supervisado, es importante conocer la distribución del target. Se trata de un target numérico continuo (un problema de regresión), por lo que tengo que revisar su distribución.
# Define the target variable: transaction revenue per session
target = 'totals.transactionRevenue'
df[target].head()
sessionId 1438082600262726746_1472803483 NaN 1283542838194038522_1472885255 NaN 4339756682310369249_1472828340 NaN 062441254657008214_1472875520 NaN 1381975521299261523_1472829727 NaN Name: totals.transactionRevenue, dtype: object
# Inspect the non-null target values to see what they look like
df.loc[df[target].notnull(), target].head(10)
sessionId 8885051388942907862_1472827393 68030000 3351538799616866750_1472855945 8380000 770431600902969839_1472836071 24080000 1546626226233851002_1472837482 546320000 8516473451212465925_1472845932 339030000 3152246617474456269_1472828860 635500000 1095886019324498043_1472835680 305470000 987390821892515431_1498241858 25000000 0115940065332308119_1498241851 109150000 2665204442613686910_1498232978 13290000 Name: totals.transactionRevenue, dtype: object
# Frequency of each distinct revenue value (NaNs excluded by default)
df[target].value_counts()
33590000 25
16990000 24
44790000 20
18990000 14
19990000 10
..
8790000 1
300850000 1
16960000 1
26380000 1
41810000 1
Name: totals.transactionRevenue, Length: 820, dtype: int64
# Descriptive stats while the column is still object dtype
# (only count / unique / top / freq are available)
df[target].describe()
count 1110 unique 820 top 33590000 freq 25 Name: totals.transactionRevenue, dtype: object
# Cast to float: the column is read as object (strings) and contains NaN,
# which an integer dtype cannot represent
df[target] = df[target].astype(float)
df[target].describe()
count 1.110000e+03 mean 1.287345e+08 std 2.626953e+08 min 2.000000e+05 25% 2.548000e+07 50% 5.118000e+07 75% 1.094425e+08 max 4.198500e+09 Name: totals.transactionRevenue, dtype: float64
# Sessions without a purchase have NaN revenue: fill them with 0.
# NOTE: plain assignment is used instead of
# df[target].fillna(0.0, inplace=True) — chained inplace operations on a
# column selection are unreliable under pandas copy-on-write and are
# deprecated (FutureWarning) since pandas 2.1.
df[target] = df[target].fillna(0.0)
df[target].describe()
count 9.030600e+04 mean 1.582345e+06 std 3.238319e+07 min 0.000000e+00 25% 0.000000e+00 50% 0.000000e+00 75% 0.000000e+00 max 4.198500e+09 Name: totals.transactionRevenue, dtype: float64
# Revenue comes in micros of currency; scale down to currency units
df[target] = df[target].div(1_000_000)
df[target].describe()
count 90306.000000 mean 1.582345 std 32.383189 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 4198.500000 Name: totals.transactionRevenue, dtype: float64
Vemos que muy pocos usuarios realizan compras, por lo que tenemos una distribución muy sesgada hacia el 0. Métricas como la media son muy malas para este tipo de distribución (están teniendo en cuenta los ceros).
Vemos el target para aquella parte del dataset en la que sí hay compras
# Target restricted to sessions that actually generated revenue
df.loc[df[target] > 0, target].describe()
count 1110.000000 mean 128.734486 std 262.695306 min 0.200000 25% 25.480000 50% 51.180000 75% 109.442500 max 4198.500000 Name: totals.transactionRevenue, dtype: float64
# Display a reference image about skewed distributions inside the notebook
from IPython.display import Image
Image('pictures/sesgos_distribucion.jpg')
Muchas de las cosas que vamos a analizar están relacionadas con compras o sin compras. Creamos una variable que nos indique esto. Después de utilizarla, tendremos que eliminarla; si no, el algoritmo la tendrá en cuenta (y en realidad esta información no existe).
Si queremos hacer un modelo que prediga si alguien compra o no (clasificación binaria), utilizaríamos esta variable como target
# Binary flag indicating whether the visit produced a purchase
df['visitWithTransaction'] = (df[target] > 0).astype(int) # convert the boolean mask to 0/1
df.head().T
| sessionId | 1438082600262726746_1472803483 | 1283542838194038522_1472885255 | 4339756682310369249_1472828340 | 062441254657008214_1472875520 | 1381975521299261523_1472829727 |
|---|---|---|---|---|---|
| Unnamed: 0 | 13 | 27 | 28 | 37 | 43 |
| channelGrouping | Organic Search | Organic Search | Referral | Organic Search | Organic Search |
| date | 20160902 | 20160902 | 20160902 | 20160902 | 20160902 |
| visitNumber | 1 | 1 | 1 | 1 | 1 |
| visitStartTime | 1472803483 | 1472885255 | 1472828340 | 1472875520 | 1472829727 |
| device.browser | Safari | Safari | Chrome | Chrome | Chrome |
| device.operatingSystem | iOS | Macintosh | Android | Windows | Macintosh |
| device.isMobile | True | False | True | False | False |
| device.deviceCategory | mobile | desktop | mobile | desktop | desktop |
| geoNetwork.continent | Asia | Europe | Asia | Oceania | Asia |
| geoNetwork.subContinent | Southern Asia | Eastern Europe | Southern Asia | Australasia | Eastern Asia |
| geoNetwork.country | Pakistan | Hungary | India | Australia | South Korea |
| geoNetwork.region | Sindh | not available in demo dataset | Karnataka | not available in demo dataset | Seoul |
| geoNetwork.metro | (not set) | not available in demo dataset | (not set) | not available in demo dataset | (not set) |
| geoNetwork.city | Karachi | not available in demo dataset | Bengaluru | not available in demo dataset | Seoul |
| geoNetwork.networkDomain | unknown.unknown | broadband.hu | unknown.unknown | uwa.edu.au | unknown.unknown |
| totals.hits | 1 | 1 | 1 | 1 | 1 |
| totals.pageviews | 1 | 1 | 1 | 1 | 1 |
| totals.bounces | 1 | 1 | 1 | 1 | 1 |
| totals.newVisits | 1 | 1 | 1 | 1 | 1 |
| totals.transactionRevenue | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| visitWithTransaction | 0 | 0 | 0 | 0 | 0 |
# Sanity check: describing the target through the flag matches the df[target] > 0 filter above
df[df['visitWithTransaction']==1][target].describe()
count 1110.000000 mean 128.734486 std 262.695306 min 0.200000 25% 25.480000 50% 51.180000 75% 109.442500 max 4198.500000 Name: totals.transactionRevenue, dtype: float64
# Total number of visits (rows)
df['visitWithTransaction'].count()
90306
# Number of visits that end in a purchase (sum of the 0/1 flag)
df['visitWithTransaction'].sum()
1110
# Percentage of visits that end in a purchase
# (the mean of a 0/1 indicator is the proportion of ones; x100 gives a percentage —
#  the original comment wrongly described this as the "mean value" of purchasing visits)
df['visitWithTransaction'].mean() * 100
1.2291542090226564
Sólo el 1,2% de las visitas acaban en compra, por lo que tengo un dataset tremendamente desbalanceado (si hiciera un problema de clasificación binaria con esta variable como target, el accuracy sería muy mala métrica).
Para datasets desbalanceados, al algoritmo le cuesta mucho aprender, por lo que este dataset tendremos que rebalancearlo más adelante.
# One-sentence conversion summary: total visits, visits with purchase, and conversion rate
total_visits = df['visitWithTransaction'].count()
visits_with_purchase = df['visitWithTransaction'].sum()
purchase_pct = df['visitWithTransaction'].mean() * 100
print(f'De las {total_visits} visitas, hay {visits_with_purchase} con compras, '
      f'lo que significa que el {purchase_pct} por ciento de las visitas tienen compra')
De las 90306 visitas, hay 1110 con compras, lo que significa que el 1.2291542090226564 por ciento de las visitas tienen compra
Ploteamos el target para visualizar su distribución
# Distribution of the full target variable (including all the zero-revenue visits)
plt.figure(figsize=(15, 5))
sns.distplot( # distribution plot
df[target]
);
Debido a todos los 0 que hay (visitas sin compras), vemos que la distribución está muy tirada a la izquierda.
Ploteamos el target que tiene compras para ver su distribución
Lo podemos hacer con la variable target o con la variable visitWithTransaction
# Distribution filtered with the visitWithTransaction flag (only visits with a purchase)
plt.figure(figsize=(15, 5))
sns.distplot(
df[df['visitWithTransaction'] == 1][target],
fit = stats.norm # overlays the normal fitted to the data passed in --> N(mean=128.734486, std=262.695306)
);
# Same distribution, this time filtering directly on the target (> 0)
plt.figure(figsize=(15, 5))
sns.distplot(
df[df[target] > 0][target],
fit = stats.norm # overlays the normal fitted to the data passed in --> N(mean=128.734486, std=262.695306)
);
Vemos que la distribución no está tan tirada a la izquierda como antes, pero sigue estando muy desbalanceada. Por lo tanto, el target no tiene una distribución normal.
Si yo quiero utilizar una regresión lineal, por ejemplo, necesito que haya una relación lineal entre el target y el atributo. Lo mejor para asegurarme que voy a tener una relación lineal es forzar al target a que tenga una distribución normal (tendré que cambiar el target y también los atributos). Aplicaremos una transformación logarítmica
# Quick look at the (rescaled, null-filled) target before transforming it
df['totals.transactionRevenue'].head()
sessionId 1438082600262726746_1472803483 0.0 1283542838194038522_1472885255 0.0 4339756682310369249_1472828340 0.0 062441254657008214_1472875520 0.0 1381975521299261523_1472829727 0.0 Name: totals.transactionRevenue, dtype: float64
Transformación logaritmica del target
# Inspect np.log1p's signature and docstring (IPython '??' magic)
np.log1p??
Call signature: np.log1p(*args, **kwargs) Type: ufunc String form: <ufunc 'log1p'> File: c:\users\jagui\anaconda3\lib\site-packages\numpy\__init__.py Docstring: log1p(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj]) Return the natural logarithm of one plus the input array, element-wise. Calculates ``log(1 + x)``. Parameters ---------- x : array_like Input values. out : ndarray, None, or tuple of ndarray and None, optional A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. where : array_like, optional This condition is broadcast over the input. At locations where the condition is True, the `out` array will be set to the ufunc result. Elsewhere, the `out` array will retain its original value. Note that if an uninitialized `out` array is created via the default ``out=None``, locations within it where the condition is False will remain uninitialized. **kwargs For other keyword-only arguments, see the :ref:`ufunc docs <ufuncs.kwargs>`. Returns ------- y : ndarray Natural logarithm of `1 + x`, element-wise. This is a scalar if `x` is a scalar. See Also -------- expm1 : ``exp(x) - 1``, the inverse of `log1p`. Notes ----- For real-valued input, `log1p` is accurate also for `x` so small that `1 + x == 1` in floating-point accuracy. Logarithm is a multivalued function: for each `x` there is an infinite number of `z` such that `exp(z) = 1 + x`. The convention is to return the `z` whose imaginary part lies in `[-pi, pi]`. For real-valued input data types, `log1p` always returns real output. For each value that cannot be expressed as a real number or infinity, it yields ``nan`` and sets the `invalid` floating point error flag. 
For complex-valued input, `log1p` is a complex analytical function that has a branch cut `[-inf, -1]` and is continuous from above on it. `log1p` handles the floating-point negative zero as an infinitesimal negative number, conforming to the C99 standard. References ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm Examples -------- >>> np.log1p(1e-99) 1e-99 >>> np.log(1 + 1e-99) 0.0 Class docstring: Functions that operate element by element on whole arrays. To see the documentation for a specific ufunc, use `info`. For example, ``np.info(np.sin)``. Because ufuncs are written in C (for speed) and linked into Python with NumPy's ufunc facility, Python's help() function finds this page whenever help() is called on a ufunc. A detailed explanation of ufuncs can be found in the docs for :ref:`ufuncs`. **Calling ufuncs:** ``op(*x[, out], where=True, **kwargs)`` Apply `op` to the arguments `*x` elementwise, broadcasting the arguments. The broadcasting rules are: * Dimensions of length 1 may be prepended to either array. * Arrays may be repeated along dimensions of length 1. Parameters ---------- *x : array_like Input arrays. out : ndarray, None, or tuple of ndarray and None, optional Alternate array object(s) in which to put the result; if provided, it must have a shape that the inputs broadcast to. A tuple of arrays (possible only as a keyword argument) must have length equal to the number of outputs; use None for uninitialized outputs to be allocated by the ufunc. where : array_like, optional This condition is broadcast over the input. At locations where the condition is True, the `out` array will be set to the ufunc result. Elsewhere, the `out` array will retain its original value. 
Note that if an uninitialized `out` array is created via the default ``out=None``, locations within it where the condition is False will remain uninitialized. **kwargs For other keyword-only arguments, see the :ref:`ufunc docs <ufuncs.kwargs>`. Returns ------- r : ndarray or tuple of ndarray `r` will have the shape that the arrays in `x` broadcast to; if `out` is provided, it will be returned. If not, `r` will be allocated and may contain uninitialized values. If the function has more than one output, then the result will be a tuple of arrays.
# Log transform of the target: np.log1p computes the NATURAL log of (1 + x)
# (not base 10 as the original comment claimed), so x = 0 maps cleanly to 0
# — plain log(0) would be undefined
df['totals.transactionRevenue_log'] = np.log1p(df[target])
df['totals.transactionRevenue_log']
sessionId
1438082600262726746_1472803483 0.0
1283542838194038522_1472885255 0.0
4339756682310369249_1472828340 0.0
062441254657008214_1472875520 0.0
1381975521299261523_1472829727 0.0
...
8316382343226738015_1483549157 0.0
6636384798982309878_1483525429 0.0
469840327005431380_1483573235 0.0
2140149974339316233_1483557808 0.0
5123779100307500332_1483554750 0.0
Name: totals.transactionRevenue_log, Length: 90306, dtype: float64
# Log transform of the target using apply on the Series.
# The original wrapped np.log1p in an identity lambda (lambda x: np.log1p(x));
# the wrapper adds nothing, so pass the ufunc directly — same result, less noise.
df['totals.transactionRevenue_log'] = df[target].apply(np.log1p)  # applied to every value of the Series
df['totals.transactionRevenue_log']
sessionId
1438082600262726746_1472803483 0.0
1283542838194038522_1472885255 0.0
4339756682310369249_1472828340 0.0
062441254657008214_1472875520 0.0
1381975521299261523_1472829727 0.0
...
8316382343226738015_1483549157 0.0
6636384798982309878_1483525429 0.0
469840327005431380_1483573235 0.0
2140149974339316233_1483557808 0.0
5123779100307500332_1483554750 0.0
Name: totals.transactionRevenue_log, Length: 90306, dtype: float64
# We can also define the function separately and then apply it to the target Series
def my_funcion(x):
    """Return the natural logarithm of (1 + x).

    Thin wrapper around np.log1p so it can be passed to Series.apply.
    Maps 0 -> 0, which avoids the undefined log(0) for visits with no revenue.
    """
    # No intermediate variable needed — return the result directly
    return np.log1p(x)
# Apply the externally defined function element-wise to the target Series
df['totals.transactionRevenue_log'] = df[target].apply(my_funcion)
df['totals.transactionRevenue_log']
sessionId
1438082600262726746_1472803483 0.0
1283542838194038522_1472885255 0.0
4339756682310369249_1472828340 0.0
062441254657008214_1472875520 0.0
1381975521299261523_1472829727 0.0
...
8316382343226738015_1483549157 0.0
6636384798982309878_1483525429 0.0
469840327005431380_1483573235 0.0
2140149974339316233_1483557808 0.0
5123779100307500332_1483554750 0.0
Name: totals.transactionRevenue_log, Length: 90306, dtype: float64
# Column name of the log-transformed target
target_log = 'totals.transactionRevenue_log'
# Stats of the log target restricted to visits with a purchase
df[df['visitWithTransaction'] == 1][target_log].describe()
count 1110.000000 mean 4.076020 std 1.151894 min 0.182322 25% 3.276390 50% 3.954699 75% 4.704494 max 8.342721 Name: totals.transactionRevenue_log, dtype: float64
# Distribution of the log-transformed target for visits WITH a purchase
# (the original comment said "al completo", but the code filters visitWithTransaction == 1)
plt.figure(figsize=(15, 5))
sns.distplot(
df[df['visitWithTransaction'] == 1][target_log],
fit = stats.norm # overlays the fitted normal --> N(mean=4.076020, std=1.151894)
);
Análisis univariante
Es una variable categórica. Comprobamos que efectivamente me la describe como categórica (sino tenemos que cambiarla)
# Confirm device.browser is treated as categorical (object dtype: count/unique/top/freq)
df['device.browser'].describe()
count 90306 unique 28 top Chrome freq 62147 Name: device.browser, dtype: object
# Frequency of each browser label — most are tiny minority categories
df['device.browser'].value_counts()
Chrome 62147 Safari 17999 Firefox 3774 Internet Explorer 1928 Edge 1018 Android Webview 765 Safari (in-app) 668 Opera Mini 601 Opera 578 YaBrowser 231 UC Browser 217 Coc Coc 83 Amazon Silk 66 Android Browser 60 Mozilla Compatible Agent 36 MRCHROME 30 Maxthon 27 Nintendo Browser 24 BlackBerry 21 Nokia Browser 14 Puffin 7 Iron 5 LYF_LS_4002_12 2 IE with Chrome Frame 1 Apple-iPhone7C2 1 SeaMonkey 1 NokiaE52-1 1 (not set) 1 Name: device.browser, dtype: int64
Vamos a agrupar todas aquellas etiquetas que son minoritarias, aplicando la función SetOthers. De momento nos vamos a quedar con las 5 primeras
# Keep the 5 most frequent browsers (head() defaults to 5 rows)
top_browsers = df['device.browser'].value_counts().head()
top_browsers
Chrome 62147 Safari 17999 Firefox 3774 Internet Explorer 1928 Edge 1018 Name: device.browser, dtype: int64
Meto las etiquetas en una lista
# Extract the browser labels as a plain Python list
top_browsers_list = top_browsers.index.to_list()
top_browsers_list
['Chrome', 'Safari', 'Firefox', 'Internet Explorer', 'Edge']
La lista de todas las etiquetas serán estas 5 más la agrupación del resto ('Others')
# Append the bucket label that will group all remaining minority browsers
top_browsers_list.append('Others')
top_browsers_list
['Chrome', 'Safari', 'Firefox', 'Internet Explorer', 'Edge', 'Others']
# Inspect pd.Categorical's signature and source (IPython '??' magic)
pd.Categorical??
Init signature: pd.Categorical( values, categories=None, ordered=None, dtype: 'Dtype | None' = None, fastpath: 'bool' = False, copy: 'bool' = True, ) Source: class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): """ Represent a categorical variable in classic R / S-plus fashion. `Categoricals` can only take on only a limited, and usually fixed, number of possible values (`categories`). In contrast to statistical categorical variables, a `Categorical` might have an order, but numerical operations (additions, divisions, ...) are not possible. All values of the `Categorical` are either in `categories` or `np.nan`. Assigning values outside of `categories` will raise a `ValueError`. Order is defined by the order of the `categories`, not lexical order of the values. Parameters ---------- values : list-like The values of the categorical. If categories are given, values not in categories will be replaced with NaN. categories : Index-like (unique), optional The unique categories for this categorical. If not given, the categories are assumed to be the unique values of `values` (sorted, if possible, otherwise in the order in which they appear). ordered : bool, default False Whether or not this categorical is treated as a ordered categorical. If True, the resulting categorical will be ordered. An ordered categorical respects, when sorted, the order of its `categories` attribute (which in turn is the `categories` argument, if provided). dtype : CategoricalDtype An instance of ``CategoricalDtype`` to use for this categorical. Attributes ---------- categories : Index The categories of this categorical codes : ndarray The codes (integer positions, which point to the categories) of this categorical, read only. ordered : bool Whether or not this Categorical is ordered. dtype : CategoricalDtype The instance of ``CategoricalDtype`` storing the ``categories`` and ``ordered``. 
Methods ------- from_codes __array__ Raises ------ ValueError If the categories do not validate. TypeError If an explicit ``ordered=True`` is given but no `categories` and the `values` are not sortable. See Also -------- CategoricalDtype : Type for categorical data. CategoricalIndex : An Index with an underlying ``Categorical``. Notes ----- See the `user guide <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__ for more. Examples -------- >>> pd.Categorical([1, 2, 3, 1, 2, 3]) [1, 2, 3, 1, 2, 3] Categories (3, int64): [1, 2, 3] >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] Missing values are not included as a category. >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) >>> c [1, 2, 3, 1, 2, 3, NaN] Categories (3, int64): [1, 2, 3] However, their presence is indicated in the `codes` attribute by code `-1`. >>> c.codes array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) Ordered `Categoricals` can be sorted according to the custom order of the categories and can have a min and max value. >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, ... 
categories=['c', 'b', 'a']) >>> c ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['c' < 'b' < 'a'] >>> c.min() 'c' """ # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 # tolist is not actually deprecated, just suppressed in the __dir__ _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) _typ = "categorical" _dtype: CategoricalDtype def __init__( self, values, categories=None, ordered=None, dtype: Dtype | None = None, fastpath: bool = False, copy: bool = True, ): dtype = CategoricalDtype._from_values_or_dtype( values, categories, ordered, dtype ) # At this point, dtype is always a CategoricalDtype, but # we may have dtype.categories be None, and we need to # infer categories in a factorization step further below if fastpath: codes = coerce_indexer_dtype(values, dtype.categories) dtype = CategoricalDtype(ordered=False).update_dtype(dtype) super().__init__(codes, dtype) return if not is_list_like(values): # GH#38433 warn( "Allowing scalars in the Categorical constructor is deprecated " "and will raise in a future version. Use `[value]` instead", FutureWarning, stacklevel=find_stack_level(), ) values = [values] # null_mask indicates missing values we want to exclude from inference. # This means: only missing values in list-likes (not arrays/ndframes). 
null_mask = np.array(False) # sanitize input if is_categorical_dtype(values): if dtype.categories is None: dtype = CategoricalDtype(values.categories, dtype.ordered) elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)): values = com.convert_to_list_like(values) if isinstance(values, list) and len(values) == 0: # By convention, empty lists result in object dtype: values = np.array([], dtype=object) elif isinstance(values, np.ndarray): if values.ndim > 1: # preempt sanitize_array from raising ValueError raise NotImplementedError( "> 1 ndim Categorical are not supported at this time" ) values = sanitize_array(values, None) else: # i.e. must be a list arr = sanitize_array(values, None) null_mask = isna(arr) if null_mask.any(): # We remove null values here, then below will re-insert # them, grep "full_codes" arr_list = [values[idx] for idx in np.where(~null_mask)[0]] # GH#44900 Do not cast to float if we have only missing values if arr_list or arr.dtype == "object": sanitize_dtype = None else: sanitize_dtype = arr.dtype arr = sanitize_array(arr_list, None, dtype=sanitize_dtype) values = arr if dtype.categories is None: try: codes, categories = factorize(values, sort=True) except TypeError as err: codes, categories = factorize(values, sort=False) if dtype.ordered: # raise, as we don't have a sortable data structure and so # the user should give us one by specifying categories raise TypeError( "'values' is not ordered, please " "explicitly specify the categories order " "by passing in a categories argument." 
) from err # we're inferring from values dtype = CategoricalDtype(categories, dtype.ordered) elif is_categorical_dtype(values.dtype): # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "_codes" old_codes = extract_array(values)._codes # type: ignore[union-attr] codes = recode_for_categories( old_codes, values.dtype.categories, dtype.categories, copy=copy ) else: codes = _get_codes_for_values(values, dtype.categories) if null_mask.any(): # Reinsert -1 placeholders for previously removed missing values full_codes = -np.ones(null_mask.shape, dtype=codes.dtype) full_codes[~null_mask] = codes codes = full_codes dtype = CategoricalDtype(ordered=False).update_dtype(dtype) arr = coerce_indexer_dtype(codes, dtype.categories) # error: Argument 1 to "__init__" of "NDArrayBacked" has incompatible # type "Union[ExtensionArray, ndarray]"; expected "ndarray" super().__init__(arr, dtype) # type: ignore[arg-type] @property def dtype(self) -> CategoricalDtype: """ The :class:`~pandas.api.types.CategoricalDtype` for this instance. """ return self._dtype @property def _constructor(self) -> type[Categorical]: return Categorical @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): return Categorical(scalars, dtype=dtype, copy=copy) @overload def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... @overload def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ... @overload def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: ... def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: """ Coerce this type to another dtype Parameters ---------- dtype : numpy dtype or pandas type copy : bool, default True By default, astype always returns a newly allocated object. If copy is set to False and dtype is categorical, the original object is returned. 
""" dtype = pandas_dtype(dtype) if self.dtype is dtype: result = self.copy() if copy else self elif is_categorical_dtype(dtype): dtype = cast("Union[str, CategoricalDtype]", dtype) # GH 10696/18593/18630 dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self result = self._set_dtype(dtype) elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) elif is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: result = np.array( self, dtype=dtype, copy=copy, ) else: # GH8628 (PERF): astype category codes instead of astyping array new_cats = self.categories._values try: new_cats = new_cats.astype(dtype=dtype, copy=copy) fill_value = self.categories._na_value if not is_valid_na_for_dtype(fill_value, dtype): fill_value = lib.item_from_zerodim( np.array(self.categories._na_value).astype(dtype) ) except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, ): msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) result = take_nd( new_cats, ensure_platform_int(self._codes), fill_value=fill_value ) return result @cache_readonly def itemsize(self) -> int: """ return the size of a single category """ return self.categories.itemsize def to_list(self): """ Alias for tolist. """ return self.tolist() @classmethod def _from_inferred_categories( cls, inferred_categories, inferred_codes, dtype, true_values=None ): """ Construct a Categorical from inferred values. For inferred categories (`dtype` is None) the categories are sorted. For explicit `dtype`, the `inferred_categories` are cast to the appropriate type. Parameters ---------- inferred_categories : Index inferred_codes : Index dtype : CategoricalDtype or 'category' true_values : list, optional If none are provided, the default ones are "True", "TRUE", and "true." 
Returns ------- Categorical """ from pandas import ( Index, to_datetime, to_numeric, to_timedelta, ) cats = Index(inferred_categories) known_categories = ( isinstance(dtype, CategoricalDtype) and dtype.categories is not None ) if known_categories: # Convert to a specialized type with `dtype` if specified. if dtype.categories.is_numeric(): cats = to_numeric(inferred_categories, errors="coerce") elif is_datetime64_dtype(dtype.categories): cats = to_datetime(inferred_categories, errors="coerce") elif is_timedelta64_dtype(dtype.categories): cats = to_timedelta(inferred_categories, errors="coerce") elif dtype.categories.is_boolean(): if true_values is None: true_values = ["True", "TRUE", "true"] # error: Incompatible types in assignment (expression has type # "ndarray", variable has type "Index") cats = cats.isin(true_values) # type: ignore[assignment] if known_categories: # Recode from observation order to dtype.categories order. categories = dtype.categories codes = recode_for_categories(inferred_codes, cats, categories) elif not cats.is_monotonic_increasing: # Sort categories and recode for unknown categories. unsorted = cats.copy() categories = cats.sort_values() codes = recode_for_categories(inferred_codes, unsorted, categories) dtype = CategoricalDtype(categories, ordered=False) else: dtype = CategoricalDtype(cats, ordered=False) codes = inferred_codes return cls(codes, dtype=dtype, fastpath=True) @classmethod def from_codes( cls, codes, categories=None, ordered=None, dtype: Dtype | None = None ): """ Make a Categorical type from codes and categories or dtype. This constructor is useful if you already have codes and categories/dtype and so do not need the (computation intensive) factorization step, which is usually done on the constructor. If your data does not follow this convention, please use the normal constructor. 
Parameters ---------- codes : array-like of int An integer array, where each integer points to a category in categories or dtype.categories, or else is -1 for NaN. categories : index-like, optional The categories for the categorical. Items need to be unique. If the categories are not given here, then they must be provided in `dtype`. ordered : bool, optional Whether or not this categorical is treated as an ordered categorical. If not given here or in `dtype`, the resulting categorical will be unordered. dtype : CategoricalDtype or "category", optional If :class:`CategoricalDtype`, cannot be used together with `categories` or `ordered`. Returns ------- Categorical Examples -------- >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) ['a', 'b', 'a', 'b'] Categories (2, object): ['a' < 'b'] """ dtype = CategoricalDtype._from_values_or_dtype( categories=categories, ordered=ordered, dtype=dtype ) if dtype.categories is None: msg = ( "The categories must be provided in 'categories' or " "'dtype'. Both were None." ) raise ValueError(msg) if is_extension_array_dtype(codes) and is_integer_dtype(codes): # Avoid the implicit conversion of Int to object if isna(codes).any(): raise ValueError("codes cannot contain NA values") codes = codes.to_numpy(dtype=np.int64) else: codes = np.asarray(codes) if len(codes) and not is_integer_dtype(codes): raise ValueError("codes need to be array-like integers") if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and len(categories)-1") return cls(codes, dtype=dtype, fastpath=True) # ------------------------------------------------------------------ # Categories/Codes/Ordered @property def categories(self): """ The categories of this categorical. Setting assigns new values to each category (effectively a rename of each individual category). The assigned value has to be a list-like object. 
All items must be unique and the number of items in the new categories must be the same as the number of items in the old categories. Assigning to `categories` is a inplace operation! Raises ------ ValueError If the new categories do not validate as categories or if the number of new categories is unequal the number of old categories See Also -------- rename_categories : Rename categories. reorder_categories : Reorder categories. add_categories : Add new categories. remove_categories : Remove the specified categories. remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. """ return self.dtype.categories @categories.setter def categories(self, categories): new_dtype = CategoricalDtype(categories, ordered=self.ordered) if self.dtype.categories is not None and len(self.dtype.categories) != len( new_dtype.categories ): raise ValueError( "new categories need to have the same number of " "items as the old categories!" ) super().__init__(self._ndarray, new_dtype) @property def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. """ return self.dtype.ordered @property def codes(self) -> np.ndarray: """ The category codes of this categorical. Codes are an array of integers which are the positions of the actual values in the categories array. There is no setter, use the other categorical methods and the normal item setter to change values in the categorical. Returns ------- ndarray[int] A non-writable view of the `codes` array. 
""" v = self._codes.view() v.flags.writeable = False return v def _set_categories(self, categories, fastpath=False): """ Sets new categories inplace Parameters ---------- fastpath : bool, default False Don't perform validation of the categories for uniqueness or nulls Examples -------- >>> c = pd.Categorical(['a', 'b']) >>> c ['a', 'b'] Categories (2, object): ['a', 'b'] >>> c._set_categories(pd.Index(['a', 'c'])) >>> c ['a', 'c'] Categories (2, object): ['a', 'c'] """ if fastpath: new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) else: new_dtype = CategoricalDtype(categories, ordered=self.ordered) if ( not fastpath and self.dtype.categories is not None and len(new_dtype.categories) != len(self.dtype.categories) ): raise ValueError( "new categories need to have the same number of " "items than the old categories!" ) super().__init__(self._ndarray, new_dtype) def _set_dtype(self, dtype: CategoricalDtype) -> Categorical: """ Internal method for directly updating the CategoricalDtype Parameters ---------- dtype : CategoricalDtype Notes ----- We don't do any validation here. It's assumed that the dtype is a (valid) instance of `CategoricalDtype`. """ codes = recode_for_categories(self.codes, self.categories, dtype.categories) return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): """ Set the ordered attribute to the boolean value. Parameters ---------- value : bool Set whether this categorical is ordered (True) or not (False). inplace : bool, default False Whether or not to set the ordered attribute in-place or return a copy of this categorical with ordered set to the value. """ inplace = validate_bool_kwarg(inplace, "inplace") new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() NDArrayBacked.__init__(cat, cat._ndarray, new_dtype) if not inplace: return cat def as_ordered(self, inplace=False): """ Set the Categorical to be ordered. 
Parameters ---------- inplace : bool, default False Whether or not to set the ordered attribute in-place or return a copy of this categorical with ordered set to True. Returns ------- Categorical or None Ordered Categorical or None if ``inplace=True``. """ inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(True, inplace=inplace) def as_unordered(self, inplace=False): """ Set the Categorical to be unordered. Parameters ---------- inplace : bool, default False Whether or not to set the ordered attribute in-place or return a copy of this categorical with ordered set to False. Returns ------- Categorical or None Unordered Categorical or None if ``inplace=True``. """ inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(False, inplace=inplace) def set_categories( self, new_categories, ordered=None, rename=False, inplace=no_default ): """ Set the categories to the specified new_categories. `new_categories` can include new categories (which will result in unused categories) or remove old categories (which results in values set to NaN). If `rename==True`, the categories will simple be renamed (less or more items than in old categories will result in values set to NaN or in unused categories respectively). This method can be used to perform more than one action of adding, removing, and reordering simultaneously and is therefore faster than performing the individual steps via the more specialised methods. On the other hand this methods does not do checks (e.g., whether the old categories are included in the new categories on a reorder), which can result in surprising changes, for example when using special string dtypes, which does not considers a S1 string equal to a single char python string. Parameters ---------- new_categories : Index-like The categories in new order. ordered : bool, default False Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. 
rename : bool, default False Whether or not the new_categories should be considered as a rename of the old categories or as reordered categories. inplace : bool, default False Whether or not to reorder the categories in-place or return a copy of this categorical with reordered categories. .. deprecated:: 1.3.0 Returns ------- Categorical with reordered categories or None if inplace. Raises ------ ValueError If new_categories does not validate as categories See Also -------- rename_categories : Rename categories. reorder_categories : Reorder categories. add_categories : Add new categories. remove_categories : Remove the specified categories. remove_unused_categories : Remove categories which are not used. """ if inplace is not no_default: warn( "The `inplace` parameter in pandas.Categorical." "set_categories is deprecated and will be removed in " "a future version. Removing unused categories will always " "return a new Categorical object.", FutureWarning, stacklevel=2, ) else: inplace = False inplace = validate_bool_kwarg(inplace, "inplace") if ordered is None: ordered = self.dtype.ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) cat = self if inplace else self.copy() if rename: if cat.dtype.categories is not None and len(new_dtype.categories) < len( cat.dtype.categories ): # remove all _codes which are larger and set to -1/NaN cat._codes[cat._codes >= len(new_dtype.categories)] = -1 codes = cat._codes else: codes = recode_for_categories( cat.codes, cat.categories, new_dtype.categories ) NDArrayBacked.__init__(cat, codes, new_dtype) if not inplace: return cat def rename_categories(self, new_categories, inplace=no_default): """ Rename categories. Parameters ---------- new_categories : list-like, dict-like or callable New categories which will replace old categories. * list-like: all items must be unique and the number of items in the new categories must match the existing number of categories. 
* dict-like: specifies a mapping from old categories to new. Categories not contained in the mapping are passed through and extra categories in the mapping are ignored. * callable : a callable that is called on all items in the old categories and whose return values comprise the new categories. inplace : bool, default False Whether or not to rename the categories inplace or return a copy of this categorical with renamed categories. .. deprecated:: 1.3.0 Returns ------- cat : Categorical or None Categorical with removed categories or None if ``inplace=True``. Raises ------ ValueError If new categories are list-like and do not have the same number of items than the current categories or do not validate as categories See Also -------- reorder_categories : Reorder categories. add_categories : Add new categories. remove_categories : Remove the specified categories. remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. Examples -------- >>> c = pd.Categorical(['a', 'a', 'b']) >>> c.rename_categories([0, 1]) [0, 0, 1] Categories (2, int64): [0, 1] For dict-like ``new_categories``, extra keys are ignored and categories not in the dictionary are passed through >>> c.rename_categories({'a': 'A', 'c': 'C'}) ['A', 'A', 'b'] Categories (2, object): ['A', 'b'] You may also provide a callable to create the new categories >>> c.rename_categories(lambda x: x.upper()) ['A', 'A', 'B'] Categories (2, object): ['A', 'B'] """ if inplace is not no_default: warn( "The `inplace` parameter in pandas.Categorical." "rename_categories is deprecated and will be removed in " "a future version. 
Removing unused categories will always " "return a new Categorical object.", FutureWarning, stacklevel=find_stack_level(), ) else: inplace = False inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() if is_dict_like(new_categories): cat.categories = [new_categories.get(item, item) for item in cat.categories] elif callable(new_categories): cat.categories = [new_categories(item) for item in cat.categories] else: cat.categories = new_categories if not inplace: return cat def reorder_categories(self, new_categories, ordered=None, inplace=no_default): """ Reorder categories as specified in new_categories. `new_categories` need to include all old categories and no new category items. Parameters ---------- new_categories : Index-like The categories in new order. ordered : bool, optional Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. inplace : bool, default False Whether or not to reorder the categories inplace or return a copy of this categorical with reordered categories. .. deprecated:: 1.3.0 Returns ------- cat : Categorical or None Categorical with removed categories or None if ``inplace=True``. Raises ------ ValueError If the new categories do not contain all old category items or any new ones See Also -------- rename_categories : Rename categories. add_categories : Add new categories. remove_categories : Remove the specified categories. remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. """ if inplace is not no_default: warn( "The `inplace` parameter in pandas.Categorical." "reorder_categories is deprecated and will be removed in " "a future version. 
Reordering categories will always " "return a new Categorical object.", FutureWarning, stacklevel=find_stack_level(), ) else: inplace = False inplace = validate_bool_kwarg(inplace, "inplace") if set(self.dtype.categories) != set(new_categories): raise ValueError( "items in new_categories are not the same as in old categories" ) with catch_warnings(): simplefilter("ignore") return self.set_categories(new_categories, ordered=ordered, inplace=inplace) def add_categories(self, new_categories, inplace=no_default): """ Add new categories. `new_categories` will be included at the last/highest place in the categories and will be unused directly after this call. Parameters ---------- new_categories : category or list-like of category The new categories to be included. inplace : bool, default False Whether or not to add the categories inplace or return a copy of this categorical with added categories. .. deprecated:: 1.3.0 Returns ------- cat : Categorical or None Categorical with new categories added or None if ``inplace=True``. Raises ------ ValueError If the new categories include old categories or do not validate as categories See Also -------- rename_categories : Rename categories. reorder_categories : Reorder categories. remove_categories : Remove the specified categories. remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. Examples -------- >>> c = pd.Categorical(['c', 'b', 'c']) >>> c ['c', 'b', 'c'] Categories (2, object): ['b', 'c'] >>> c.add_categories(['d', 'a']) ['c', 'b', 'c'] Categories (4, object): ['b', 'c', 'd', 'a'] """ if inplace is not no_default: warn( "The `inplace` parameter in pandas.Categorical." "add_categories is deprecated and will be removed in " "a future version. 
Removing unused categories will always " "return a new Categorical object.", FutureWarning, stacklevel=find_stack_level(), ) else: inplace = False inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(new_categories): new_categories = [new_categories] already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: raise ValueError( f"new categories must not include old categories: {already_included}" ) new_categories = list(self.dtype.categories) + list(new_categories) new_dtype = CategoricalDtype(new_categories, self.ordered) cat = self if inplace else self.copy() codes = coerce_indexer_dtype(cat._ndarray, new_dtype.categories) NDArrayBacked.__init__(cat, codes, new_dtype) if not inplace: return cat def remove_categories(self, removals, inplace=no_default): """ Remove the specified categories. `removals` must be included in the old categories. Values which were in the removed categories will be set to NaN Parameters ---------- removals : category or list of categories The categories which should be removed. inplace : bool, default False Whether or not to remove the categories inplace or return a copy of this categorical with removed categories. .. deprecated:: 1.3.0 Returns ------- cat : Categorical or None Categorical with removed categories or None if ``inplace=True``. Raises ------ ValueError If the removals are not contained in the categories See Also -------- rename_categories : Rename categories. reorder_categories : Reorder categories. add_categories : Add new categories. remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. 
Examples -------- >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd']) >>> c ['a', 'c', 'b', 'c', 'd'] Categories (4, object): ['a', 'b', 'c', 'd'] >>> c.remove_categories(['d', 'a']) [NaN, 'c', 'b', 'c', NaN] Categories (2, object): ['b', 'c'] """ if inplace is not no_default: warn( "The `inplace` parameter in pandas.Categorical." "remove_categories is deprecated and will be removed in " "a future version. Removing unused categories will always " "return a new Categorical object.", FutureWarning, stacklevel=find_stack_level(), ) else: inplace = False inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(removals): removals = [removals] removal_set = set(removals) not_included = removal_set - set(self.dtype.categories) new_categories = [c for c in self.dtype.categories if c not in removal_set] # GH 10156 if any(isna(removals)): not_included = {x for x in not_included if notna(x)} new_categories = [x for x in new_categories if notna(x)] if len(not_included) != 0: raise ValueError(f"removals must all be in old categories: {not_included}") with catch_warnings(): simplefilter("ignore") return self.set_categories( new_categories, ordered=self.ordered, rename=False, inplace=inplace ) def remove_unused_categories(self, inplace=no_default): """ Remove categories which are not used. Parameters ---------- inplace : bool, default False Whether or not to drop unused categories inplace or return a copy of this categorical with unused categories dropped. .. deprecated:: 1.2.0 Returns ------- cat : Categorical or None Categorical with unused categories dropped or None if ``inplace=True``. See Also -------- rename_categories : Rename categories. reorder_categories : Reorder categories. add_categories : Add new categories. remove_categories : Remove the specified categories. set_categories : Set the categories to the specified ones. 
Examples -------- >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd']) >>> c ['a', 'c', 'b', 'c', 'd'] Categories (4, object): ['a', 'b', 'c', 'd'] >>> c[2] = 'a' >>> c[4] = 'c' >>> c ['a', 'c', 'a', 'c', 'c'] Categories (4, object): ['a', 'b', 'c', 'd'] >>> c.remove_unused_categories() ['a', 'c', 'a', 'c', 'c'] Categories (2, object): ['a', 'c'] """ if inplace is not no_default: warn( "The `inplace` parameter in pandas.Categorical." "remove_unused_categories is deprecated and " "will be removed in a future version.", FutureWarning, stacklevel=find_stack_level(), ) else: inplace = False inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() idx, inv = np.unique(cat._codes, return_inverse=True) if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv = idx[1:], inv - 1 new_categories = cat.dtype.categories.take(idx) new_dtype = CategoricalDtype._from_fastpath( new_categories, ordered=self.ordered ) new_codes = coerce_indexer_dtype(inv, new_dtype.categories) NDArrayBacked.__init__(cat, new_codes, new_dtype) if not inplace: return cat # ------------------------------------------------------------------ def map(self, mapper): """ Map categories using an input mapping or function. Maps the categories to new categories. If the mapping correspondence is one-to-one the result is a :class:`~pandas.Categorical` which has the same order property as the original, otherwise a :class:`~pandas.Index` is returned. NaN values are unaffected. If a `dict` or :class:`~pandas.Series` is used any unmapped category is mapped to `NaN`. Note that if this happens an :class:`~pandas.Index` will be returned. Parameters ---------- mapper : function, dict, or Series Mapping correspondence. Returns ------- pandas.Categorical or pandas.Index Mapped categorical. See Also -------- CategoricalIndex.map : Apply a mapping correspondence on a :class:`~pandas.CategoricalIndex`. Index.map : Apply a mapping correspondence on an :class:`~pandas.Index`. 
Series.map : Apply a mapping correspondence on a :class:`~pandas.Series`. Series.apply : Apply more complex functions on a :class:`~pandas.Series`. Examples -------- >>> cat = pd.Categorical(['a', 'b', 'c']) >>> cat ['a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] >>> cat.map(lambda x: x.upper()) ['A', 'B', 'C'] Categories (3, object): ['A', 'B', 'C'] >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}) ['first', 'second', 'third'] Categories (3, object): ['first', 'second', 'third'] If the mapping is one-to-one the ordering of the categories is preserved: >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True) >>> cat ['a', 'b', 'c'] Categories (3, object): ['a' < 'b' < 'c'] >>> cat.map({'a': 3, 'b': 2, 'c': 1}) [3, 2, 1] Categories (3, int64): [3 < 2 < 1] If the mapping is not one-to-one an :class:`~pandas.Index` is returned: >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}) Index(['first', 'second', 'first'], dtype='object') If a `dict` is used, all unmapped categories are mapped to `NaN` and the result is an :class:`~pandas.Index`: >>> cat.map({'a': 'first', 'b': 'second'}) Index(['first', 'second', nan], dtype='object') """ new_categories = self.categories.map(mapper) try: return self.from_codes( self._codes.copy(), categories=new_categories, ordered=self.ordered ) except ValueError: # NA values are represented in self._codes with -1 # np.take causes NA values to take final element in new_categories if np.any(self._codes == -1): new_categories = new_categories.insert(len(new_categories), np.nan) return np.take(new_categories, self._codes) __eq__ = _cat_compare_op(operator.eq) __ne__ = _cat_compare_op(operator.ne) __lt__ = _cat_compare_op(operator.lt) __gt__ = _cat_compare_op(operator.gt) __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) # ------------------------------------------------------------- # Validators; ideally these can be de-duplicated def _validate_setitem_value(self, value): if not is_hashable(value): # 
wrap scalars and hashable-listlikes in list return self._validate_listlike(value) else: return self._validate_scalar(value) _validate_searchsorted_value = _validate_setitem_value def _validate_scalar(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our underlying ndarray, raising TypeError if this is not possible. Parameters ---------- fill_value : object Returns ------- fill_value : int Raises ------ TypeError """ if is_valid_na_for_dtype(fill_value, self.categories.dtype): fill_value = -1 elif fill_value in self.categories: fill_value = self._unbox_scalar(fill_value) else: raise TypeError( "Cannot setitem on a Categorical with a new " f"category ({fill_value}), set the categories first" ) return fill_value # ------------------------------------------------------------- @ravel_compat def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: """ The numpy array interface. Returns ------- numpy.array A numpy array of either the specified dtype or, if dtype==None (default), the same dtype as categorical.categories.dtype. """ ret = take_nd(self.categories._values, self._codes) if dtype and not is_dtype_equal(dtype, self.categories.dtype): return np.asarray(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. return np.asarray(ret) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( self, ufunc, method, *inputs, **kwargs ) if result is not NotImplemented: return result if method == "reduce": # e.g. 
TestCategoricalAnalytics::test_min_max_ordered result = arraylike.dispatch_reduction_ufunc( self, ufunc, method, *inputs, **kwargs ) if result is not NotImplemented: return result # for all other cases, raise for now (similarly as what happens in # Series.__array_prepare__) raise TypeError( f"Object with dtype {self.dtype} cannot perform " f"the numpy op {ufunc.__name__}" ) def __setstate__(self, state): """Necessary for making this object picklable""" if not isinstance(state, dict): return super().__setstate__(state) if "_dtype" not in state: state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) if "_codes" in state and "_ndarray" not in state: # backward compat, changed what is property vs attribute state["_ndarray"] = state.pop("_codes") super().__setstate__(state) @property def nbytes(self) -> int: return self._codes.nbytes + self.dtype.categories.values.nbytes def memory_usage(self, deep: bool = False) -> int: """ Memory usage of my values Parameters ---------- deep : bool Introspect the data deeply, interrogate `object` dtypes for system-level memory consumption Returns ------- bytes used Notes ----- Memory usage does not include memory consumed by elements that are not components of the array if deep=False See Also -------- numpy.ndarray.nbytes """ return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) def isna(self) -> np.ndarray: """ Detect missing values Missing values (-1 in .codes) are detected. Returns ------- np.ndarray[bool] of whether my values are null See Also -------- isna : Top-level isna. isnull : Alias of isna. Categorical.notna : Boolean inverse of Categorical.isna. """ return self._codes == -1 isnull = isna def notna(self) -> np.ndarray: """ Inverse of isna Both missing values (-1 in .codes) and NA as a category are detected as null. Returns ------- np.ndarray[bool] of whether my values are not null See Also -------- notna : Top-level notna. notnull : Alias of notna. 
Categorical.isna : Boolean inverse of Categorical.notna. """ return ~self.isna() notnull = notna def value_counts(self, dropna: bool = True): """ Return a Series containing counts of each category. Every category will have an entry, even those with a count of 0. Parameters ---------- dropna : bool, default True Don't include counts of NaN. Returns ------- counts : Series See Also -------- Series.value_counts """ from pandas import ( CategoricalIndex, Series, ) code, cat = self._codes, self.categories ncat, mask = (len(cat), code >= 0) ix, clean = np.arange(ncat), mask.all() if dropna or clean: obs = code if clean else code[mask] count = np.bincount(obs, minlength=ncat or 0) else: count = np.bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) ix = coerce_indexer_dtype(ix, self.dtype.categories) ix = self._from_backing_data(ix) return Series(count, index=CategoricalIndex(ix), dtype="int64") # error: Argument 2 of "_empty" is incompatible with supertype # "NDArrayBackedExtensionArray"; supertype defines the argument type as # "ExtensionDtype" @classmethod def _empty( # type: ignore[override] cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype ) -> Categorical: """ Analogous to np.empty(shape, dtype=dtype) Parameters ---------- shape : tuple[int] dtype : CategoricalDtype """ arr = cls._from_sequence([], dtype=dtype) # We have to use np.zeros instead of np.empty otherwise the resulting # ndarray may contain codes not supported by this dtype, in which # case repr(result) could segfault. backing = np.zeros(shape, dtype=arr._ndarray.dtype) return arr._from_backing_data(backing) def _internal_get_values(self): """ Return the values. For internal compatibility with pandas formatting. Returns ------- np.ndarray or Index A numpy array of the same dtype as categorical.categories.dtype or Index if datetime / periods. 
""" # if we are a datetime and period index, return Index to keep metadata if needs_i8_conversion(self.categories.dtype): return self.categories.take(self._codes, fill_value=NaT) elif is_integer_dtype(self.categories) and -1 in self._codes: return self.categories.astype("object").take(self._codes, fill_value=np.nan) return np.array(self) def check_for_ordered(self, op): """assert that we are ordered""" if not self.ordered: raise TypeError( f"Categorical is not ordered for operation {op}\n" "you can use .as_ordered() to change the " "Categorical to an ordered one\n" ) def argsort(self, ascending=True, kind="quicksort", **kwargs): """ Return the indices that would sort the Categorical. .. versionchanged:: 0.25.0 Changed to sort missing values at the end. Parameters ---------- ascending : bool, default True Whether the indices should result in an ascending or descending sort. kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional Sorting algorithm. **kwargs: passed through to :func:`numpy.argsort`. Returns ------- np.ndarray[np.intp] See Also -------- numpy.ndarray.argsort Notes ----- While an ordering is applied to the category values, arg-sorting in this context refers more to organizing and grouping together based on matching category values. Thus, this function can be called on an unordered Categorical instance unlike the functions 'Categorical.min' and 'Categorical.max'. Examples -------- >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() array([2, 0, 1, 3]) >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], ... categories=['c', 'b', 'a'], ... ordered=True) >>> cat.argsort() array([3, 0, 1, 2]) Missing values are placed at the end >>> cat = pd.Categorical([2, None, 1]) >>> cat.argsort() array([2, 0, 1]) """ return super().argsort(ascending=ascending, kind=kind, **kwargs) def sort_values( self, inplace: bool = False, ascending: bool = True, na_position: str = "last" ): """ Sort the Categorical by category value returning a new Categorical by default. 
While an ordering is applied to the category values, sorting in this context refers more to organizing and grouping together based on matching category values. Thus, this function can be called on an unordered Categorical instance unlike the functions 'Categorical.min' and 'Categorical.max'. Parameters ---------- inplace : bool, default False Do operation in place. ascending : bool, default True Order ascending. Passing False orders descending. The ordering parameter provides the method by which the category values are organized. na_position : {'first', 'last'} (optional, default='last') 'first' puts NaNs at the beginning 'last' puts NaNs at the end Returns ------- Categorical or None See Also -------- Categorical.sort Series.sort_values Examples -------- >>> c = pd.Categorical([1, 2, 2, 1, 5]) >>> c [1, 2, 2, 1, 5] Categories (3, int64): [1, 2, 5] >>> c.sort_values() [1, 1, 2, 2, 5] Categories (3, int64): [1, 2, 5] >>> c.sort_values(ascending=False) [5, 2, 2, 1, 1] Categories (3, int64): [1, 2, 5] Inplace sorting can be done as well: >>> c.sort_values(inplace=True) >>> c [1, 1, 2, 2, 5] Categories (3, int64): [1, 2, 5] >>> >>> c = pd.Categorical([1, 2, 2, 1, 5]) 'sort_values' behaviour with NaNs. 
Note that 'na_position' is independent of the 'ascending' parameter: >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) >>> c [NaN, 2, 2, NaN, 5] Categories (2, int64): [2, 5] >>> c.sort_values() [2, 2, 5, NaN, NaN] Categories (2, int64): [2, 5] >>> c.sort_values(ascending=False) [5, 2, 2, NaN, NaN] Categories (2, int64): [2, 5] >>> c.sort_values(na_position='first') [NaN, NaN, 2, 2, 5] Categories (2, int64): [2, 5] >>> c.sort_values(ascending=False, na_position='first') [NaN, NaN, 5, 2, 2] Categories (2, int64): [2, 5] """ inplace = validate_bool_kwarg(inplace, "inplace") if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {repr(na_position)}") sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) if inplace: self._codes[:] = self._codes[sorted_idx] else: codes = self._codes[sorted_idx] return self._from_backing_data(codes) def _rank( self, *, axis: int = 0, method: str = "average", na_option: str = "keep", ascending: bool = True, pct: bool = False, ): """ See Series.rank.__doc__. """ if axis != 0: raise NotImplementedError vff = self._values_for_rank() return algorithms.rank( vff, axis=axis, method=method, na_option=na_option, ascending=ascending, pct=pct, ) def _values_for_rank(self): """ For correctly ranking ordered categorical data. See GH#15420 Ordered categorical data should be ranked on the basis of codes with -1 translated to NaN. 
Returns ------- numpy.array """ from pandas import Series if self.ordered: values = self.codes mask = values == -1 if mask.any(): values = values.astype("float64") values[mask] = np.nan elif self.categories.is_numeric(): values = np.array(self) else: # reorder the categories (so rank can use the float codes) # instead of passing an object array to rank values = np.array( self.rename_categories(Series(self.categories).rank().values) ) return values def view(self, dtype=None): if dtype is not None: raise NotImplementedError(dtype) return self._from_backing_data(self._ndarray) def to_dense(self) -> np.ndarray: """ Return my 'dense' representation For internal compatibility with numpy arrays. Returns ------- dense : array """ warn( "Categorical.to_dense is deprecated and will be removed in " "a future version. Use np.asarray(cat) instead.", FutureWarning, stacklevel=find_stack_level(), ) return np.asarray(self) # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat @property def _codes(self) -> np.ndarray: return self._ndarray @_codes.setter def _codes(self, value: np.ndarray): warn( "Setting the codes on a Categorical is deprecated and will raise in " "a future version. Create a new Categorical object instead", FutureWarning, stacklevel=find_stack_level(), ) # GH#40606 NDArrayBacked.__init__(self, value, self.dtype) def _box_func(self, i: int): if i == -1: return np.NaN return self.categories[i] def _unbox_scalar(self, key) -> int: # searchsorted is very performance sensitive. By converting codes # to same dtype as self.codes, we get much faster performance. 
code = self.categories.get_loc(key) code = self._ndarray.dtype.type(code) return code # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): # GH#27745 deprecate alias that other EAs dont have warn( "Categorical.take_nd is deprecated, use Categorical.take instead", FutureWarning, stacklevel=find_stack_level(), ) return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value) def __iter__(self): """ Returns an Iterator over the values of this Categorical. """ if self.ndim == 1: return iter(self._internal_get_values().tolist()) else: return (self[n] for n in range(len(self))) def __contains__(self, key) -> bool: """ Returns True if `key` is in this Categorical. """ # if key is a NaN, check if any NaN is in self. if is_valid_na_for_dtype(key, self.categories.dtype): return bool(self.isna().any()) return contains(self, key, container=self._codes) # ------------------------------------------------------------------ # Rendering Methods def _formatter(self, boxed: bool = False): # Defer to CategoricalFormatter's formatter. 
return None def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: """ a short repr displaying only max_vals and an optional (but default footer) """ num = max_vals // 2 head = self[:num]._get_repr(length=False, footer=False) tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) result = f"{head[:-1]}, ..., {tail[1:]}" if footer: result = f"{result}\n{self._repr_footer()}" return str(result) def _repr_categories(self) -> list[str]: """ return the base repr for the categories """ max_categories = ( 10 if get_option("display.max_categories") == 0 else get_option("display.max_categories") ) from pandas.io.formats import format as fmt format_array = partial( fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC ) if len(self.categories) > max_categories: num = max_categories // 2 head = format_array(self.categories[:num]) tail = format_array(self.categories[-num:]) category_strs = head + ["..."] + tail else: category_strs = format_array(self.categories) # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] return category_strs def _repr_categories_info(self) -> str: """ Returns a string representation of the footer. 
""" category_strs = self._repr_categories() dtype = str(self.categories.dtype) levheader = f"Categories ({len(self.categories)}, {dtype}): " width, height = get_terminal_size() max_width = get_option("display.width") or width if console.in_ipython_frontend(): # 0 = no breaks max_width = 0 levstring = "" start = True cur_col_len = len(levheader) # header sep_len, sep = (3, " < ") if self.ordered else (2, ", ") linesep = sep.rstrip() + "\n" # remove whitespace for val in category_strs: if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: levstring += linesep + (" " * (len(levheader) + 1)) cur_col_len = len(levheader) + 1 # header + a whitespace elif not start: levstring += sep cur_col_len += len(val) levstring += val start = False # replace to simple save space by return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]" def _repr_footer(self) -> str: info = self._repr_categories_info() return f"Length: {len(self)}\n{info}" def _get_repr(self, length: bool = True, na_rep="NaN", footer: bool = True) -> str: from pandas.io.formats import format as fmt formatter = fmt.CategoricalFormatter( self, length=length, na_rep=na_rep, footer=footer ) result = formatter.to_string() return str(result) def __repr__(self) -> str: """ String representation. 
""" _maxlen = 10 if len(self._codes) > _maxlen: result = self._tidy_repr(_maxlen) elif len(self._codes) > 0: result = self._get_repr(length=len(self) > _maxlen) else: msg = self._get_repr(length=False, footer=True).replace("\n", ", ") result = f"[], {msg}" return result # ------------------------------------------------------------------ def _validate_listlike(self, value): # NB: here we assume scalar-like tuples have already been excluded value = extract_array(value, extract_numpy=True) # require identical categories set if isinstance(value, Categorical): if not is_dtype_equal(self.dtype, value.dtype): raise TypeError( "Cannot set a Categorical with another, " "without identical categories" ) # is_dtype_equal implies categories_match_up_to_permutation value = self._encode_with_my_categories(value) return value._codes from pandas import Index # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914 to_add = Index._with_infer(value, tupleize_cols=False).difference( self.categories ) # no assignments of values not in categories, but it's always ok to set # something to np.nan if len(to_add) and not isna(to_add).all(): raise TypeError( "Cannot setitem on a Categorical with a new " "category, set the categories first" ) codes = self.categories.get_indexer(value) return codes.astype(self._ndarray.dtype, copy=False) def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ Compute the inverse of a categorical, returning a dict of categories -> indexers. 
*This is an internal function* Returns ------- Dict[Hashable, np.ndarray[np.intp]] dict of categories -> indexers Examples -------- >>> c = pd.Categorical(list('aabca')) >>> c ['a', 'a', 'b', 'c', 'a'] Categories (3, object): ['a', 'b', 'c'] >>> c.categories Index(['a', 'b', 'c'], dtype='object') >>> c.codes array([0, 0, 1, 2, 0], dtype=int8) >>> c._reverse_indexer() {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])} """ categories = self.categories r, counts = libalgos.groupsort_indexer( ensure_platform_int(self.codes), categories.size ) counts = ensure_int64(counts).cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) return dict(zip(categories, _result)) # ------------------------------------------------------------------ # Reductions @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") def min(self, *, skipna=True, **kwargs): """ The minimum value of the object. Only ordered `Categoricals` have a minimum! .. versionchanged:: 1.0.0 Returns an NA value on empty arrays Raises ------ TypeError If the `Categorical` is not `ordered`. Returns ------- min : the minimum of this `Categorical` """ nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_min((), kwargs) self.check_for_ordered("min") if not len(self._codes): return self.dtype.na_value good = self._codes != -1 if not good.all(): if skipna and good.any(): pointer = self._codes[good].min() else: return np.nan else: pointer = self._codes.min() return self._wrap_reduction_result(None, pointer) @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") def max(self, *, skipna=True, **kwargs): """ The maximum value of the object. Only ordered `Categoricals` have a maximum! .. versionchanged:: 1.0.0 Returns an NA value on empty arrays Raises ------ TypeError If the `Categorical` is not `ordered`. 
Returns ------- max : the maximum of this `Categorical` """ nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_max((), kwargs) self.check_for_ordered("max") if not len(self._codes): return self.dtype.na_value good = self._codes != -1 if not good.all(): if skipna and good.any(): pointer = self._codes[good].max() else: return np.nan else: pointer = self._codes.max() return self._wrap_reduction_result(None, pointer) def mode(self, dropna: bool = True) -> Categorical: """ Returns the mode(s) of the Categorical. Always returns `Categorical` even if only one value. Parameters ---------- dropna : bool, default True Don't consider counts of NaN/NaT. Returns ------- modes : `Categorical` (sorted) """ warn( "Categorical.mode is deprecated and will be removed in a future version. " "Use Series.mode instead.", FutureWarning, stacklevel=find_stack_level(), ) return self._mode(dropna=dropna) def _mode(self, dropna: bool = True) -> Categorical: codes = self._codes if dropna: good = self._codes != -1 codes = self._codes[good] codes = htable.mode(codes, dropna) codes.sort() codes = coerce_indexer_dtype(codes, self.dtype.categories) return self._from_backing_data(codes) # ------------------------------------------------------------------ # ExtensionArray Interface def unique(self): """ Return the ``Categorical`` which ``categories`` and ``codes`` are unique. .. versionchanged:: 1.3.0 Previously, unused categories were dropped from the new categories. Returns ------- Categorical See Also -------- pandas.unique CategoricalIndex.unique Series.unique : Return unique values of Series object. 
Examples -------- >>> pd.Categorical(list("baabc")).unique() ['b', 'a', 'c'] Categories (3, object): ['a', 'b', 'c'] >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique() ['b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] """ unique_codes = unique1d(self.codes) return self._from_backing_data(unique_codes) def _values_for_factorize(self): return self._ndarray, -1 @classmethod def _from_factorized(cls, uniques, original): return original._constructor( original.categories.take(uniques), dtype=original.dtype ) def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. Parameters ---------- other : `Categorical` Returns ------- bool """ if not isinstance(other, Categorical): return False elif self._categories_match_up_to_permutation(other): other = self._encode_with_my_categories(other) return np.array_equal(self._codes, other._codes) return False @classmethod def _concat_same_type( cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0 ) -> CategoricalT: from pandas.core.dtypes.concat import union_categoricals first = to_concat[0] if axis >= first.ndim: raise ValueError( f"axis {axis} is out of bounds for array of dimension {first.ndim}" ) if axis == 1: # Flatten, concatenate then reshape if not all(x.ndim == 2 for x in to_concat): raise ValueError # pass correctly-shaped to union_categoricals tc_flat = [] for obj in to_concat: tc_flat.extend([obj[:, i] for i in range(obj.shape[1])]) res_flat = cls._concat_same_type(tc_flat, axis=0) result = res_flat.reshape(len(first), -1, order="F") return result result = union_categoricals(to_concat) return result # ------------------------------------------------------------------ def _encode_with_my_categories(self, other: Categorical) -> Categorical: """ Re-encode another categorical using this Categorical's categories. Notes ----- This assumes we have already checked self._categories_match_up_to_permutation(other). 
""" # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of # dtype-matching. codes = recode_for_categories( other.codes, other.categories, self.categories, copy=False ) return self._from_backing_data(codes) def _categories_match_up_to_permutation(self, other: Categorical) -> bool: """ Returns True if categoricals are the same dtype same categories, and same ordered Parameters ---------- other : Categorical Returns ------- bool """ return hash(self.dtype) == hash(other.dtype) def is_dtype_equal(self, other) -> bool: warn( "Categorical.is_dtype_equal is deprecated and will be removed " "in a future version", FutureWarning, stacklevel=find_stack_level(), ) try: return self._categories_match_up_to_permutation(other) except (AttributeError, TypeError): return False def describe(self): """ Describes this Categorical Returns ------- description: `DataFrame` A dataframe with frequency and counts by category. """ counts = self.value_counts(dropna=False) freqs = counts / counts.sum() from pandas import Index from pandas.core.reshape.concat import concat result = concat([counts, freqs], axis=1) result.columns = Index(["counts", "freqs"]) result.index.name = "categories" return result def isin(self, values) -> npt.NDArray[np.bool_]: """ Check whether `values` are contained in Categorical. Return a boolean NumPy Array showing whether each element in the Categorical matches an element in the passed sequence of `values` exactly. Parameters ---------- values : set or list-like The sequence of values to test. Passing in a single string will raise a ``TypeError``. Instead, turn a single string into a list of one element. Returns ------- np.ndarray[bool] Raises ------ TypeError * If `values` is not a set or list-like See Also -------- pandas.Series.isin : Equivalent method on Series. Examples -------- >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama', ... 
'hippo']) >>> s.isin(['cow', 'lama']) array([ True, True, True, False, True, False]) Passing a single string as ``s.isin('lama')`` will raise an error. Use a list of one element instead: >>> s.isin(['lama']) array([ True, False, True, False, True, False]) """ if not is_list_like(values): values_type = type(values).__name__ raise TypeError( "only list-like objects are allowed to be passed " f"to isin(), you passed a [{values_type}]" ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer(values) code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) def replace(self, to_replace, value, inplace: bool = False): """ Replaces all instances of one value with another Parameters ---------- to_replace: object The value to be replaced value: object The value to replace it with inplace: bool Whether the operation is done in-place Returns ------- None if inplace is True, otherwise the new Categorical after replacement Examples -------- >>> s = pd.Categorical([1, 2, 1, 3]) >>> s.replace(1, 3) [3, 2, 3, 3] Categories (2, int64): [2, 3] """ # GH#44929 deprecation warn( "Categorical.replace is deprecated and will be removed in a future " "version. 
Use Series.replace directly instead.", FutureWarning, stacklevel=find_stack_level(), ) return self._replace(to_replace=to_replace, value=value, inplace=inplace) def _replace(self, *, to_replace, value, inplace: bool = False): inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() # build a dict of (to replace -> value) pairs if is_list_like(to_replace): # if to_replace is list-like and value is scalar replace_dict = {replace_value: value for replace_value in to_replace} else: # if both to_replace and value are scalar replace_dict = {to_replace: value} # other cases, like if both to_replace and value are list-like or if # to_replace is a dict, are handled separately in NDFrame for replace_value, new_value in replace_dict.items(): if new_value == replace_value: continue if replace_value in cat.categories: if isna(new_value): with catch_warnings(): simplefilter("ignore") cat.remove_categories(replace_value, inplace=True) continue categories = cat.categories.tolist() index = categories.index(replace_value) if new_value in cat.categories: value_index = categories.index(new_value) cat._codes[cat._codes == index] = value_index with catch_warnings(): simplefilter("ignore") cat.remove_categories(replace_value, inplace=True) else: categories[index] = new_value with catch_warnings(): simplefilter("ignore") cat.rename_categories(categories, inplace=True) if not inplace: return cat # ------------------------------------------------------------------------ # String methods interface def _str_map( self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True ): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though. 
from pandas.core.arrays import PandasArray categories = self.categories codes = self.codes result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype) return take_nd(result, codes, fill_value=na_value) def _str_get_dummies(self, sep="|"): # sep may not be in categories. Just bail on this. from pandas.core.arrays import PandasArray return PandasArray(self.astype(str))._str_get_dummies(sep) File: c:\users\jagui\anaconda3\lib\site-packages\pandas\core\arrays\categorical.py Type: ABCMeta Subclasses: SubclassedCategorical
Vamos a utilizar la variable Categorical (es igual que el object), indicando qué etiquetas queremos meterle.
# Convert the column to a Categorical dtype restricted to the top labels;
# any value not in top_browsers_list becomes NaN.
df['device.browser'] = pd.Categorical(
df['device.browser'], # the series of the variable to convert to categorical
categories = top_browsers_list # the only labels to keep (out of all the variable's values)
)
df['device.browser']
sessionId
1438082600262726746_1472803483 Safari
1283542838194038522_1472885255 Safari
4339756682310369249_1472828340 Chrome
062441254657008214_1472875520 Chrome
1381975521299261523_1472829727 Chrome
...
8316382343226738015_1483549157 Chrome
6636384798982309878_1483525429 Chrome
469840327005431380_1483573235 Internet Explorer
2140149974339316233_1483557808 Chrome
5123779100307500332_1483554750 Chrome
Name: device.browser, Length: 90306, dtype: category
Categories (6, object): ['Chrome', 'Safari', 'Firefox', 'Internet Explorer', 'Edge', 'Others']
df['device.browser'].value_counts(dropna=False)
Chrome 62147 Safari 17999 Firefox 3774 NaN 3440 Internet Explorer 1928 Edge 1018 Others 0 Name: device.browser, dtype: int64
Las observaciones de las etiquetas que no estaban en la lista, Python me las devuelve como nulos, que serían la suma de observaciones de las etiquetas minoritarias y que corresponden a la etiqueta Others que he creado. Relleno esos nulos con 'Others'.
# Fill the NaNs (labels dropped by the Categorical conversion) with 'Others'
df['device.browser'].fillna('Others', inplace=True)
df['device.browser'].describe()
count 90306 unique 6 top Chrome freq 62147 Name: device.browser, dtype: object
df['device.browser'].value_counts(dropna=False)
Chrome 62147 Safari 17999 Firefox 3774 Others 3440 Internet Explorer 1928 Edge 1018 Name: device.browser, dtype: int64
df['device.browser'].value_counts(
normalize = True, # return the share of observations instead of raw counts
dropna = False)
Chrome 0.688182 Safari 0.199311 Firefox 0.041791 Others 0.038093 Internet Explorer 0.021350 Edge 0.011273 Name: device.browser, dtype: float64
Correlación con el target
Hacemos una pivot_table y un countplot para ver qué tipo de relación hay entre ellos. No lo vamos a aplicar a todo el dataframe porque si no tendría en cuenta a toda la gente que no ha comprado. Lo aplico sobre la gente que compra. Vamos a ver dos comportamientos:
# Pivot table over buyers only
df[df['visitWithTransaction'] == 1].pivot_table(
index = 'device.browser', # the variable under analysis goes on the index
values = target_log, # the target to analyse goes in the values
aggfunc = [len, np.mean]) # aggregation functions: row count and mean of the target
| len | mean | |
|---|---|---|
| totals.transactionRevenue_log | totals.transactionRevenue_log | |
| device.browser | ||
| Chrome | 1009 | 4.141194 |
| Safari | 65 | 3.346190 |
| Firefox | 15 | 3.747523 |
| Internet Explorer | 12 | 3.338823 |
| Edge | 5 | 4.016332 |
| Others | 4 | 3.013546 |
La mayoría de las compras se hacen a través de Chrome. Por lo tanto, mi modelo va a saber que si no viene de Chrome raramente va a gastar más de cero. Lo más normal es que prediga que esa persona no compra en nuestra web.
# Transactions per label and mean transaction value per label (buyers only)
_results_df = df[df['visitWithTransaction'] == 1].pivot_table(index = 'device.browser', values = target_log, aggfunc = [len, np.mean])
# Rename the columns
_results_df.columns = ['transactions', 'mean_revenue_log']
_results_df
| transactions | mean_revenue_log | |
|---|---|---|
| device.browser | ||
| Chrome | 1009 | 4.141194 |
| Safari | 65 | 3.346190 |
| Firefox | 15 | 3.747523 |
| Internet Explorer | 12 | 3.338823 |
| Edge | 5 | 4.016332 |
| Others | 4 | 3.013546 |
# Absolute row count per label (aligned to _results_df by the category index)
_results_df['n_rows'] = df['device.browser'].value_counts(dropna = False)
_results_df
| transactions | mean_revenue_log | n_rows | |
|---|---|---|---|
| device.browser | |||
| Chrome | 1009 | 4.141194 | 62147 |
| Safari | 65 | 3.346190 | 17999 |
| Firefox | 15 | 3.747523 | 3774 |
| Internet Explorer | 12 | 3.338823 | 1928 |
| Edge | 5 | 4.016332 | 1018 |
| Others | 4 | 3.013546 | 3440 |
# Row share per label (as a fraction of all rows)
_results_df['pct_rows'] = df['device.browser'].value_counts(normalize = True, dropna = False)
_results_df
| transactions | mean_revenue_log | n_rows | pct_rows | |
|---|---|---|---|---|
| device.browser | ||||
| Chrome | 1009 | 4.141194 | 62147 | 0.688182 |
| Safari | 65 | 3.346190 | 17999 | 0.199311 |
| Firefox | 15 | 3.747523 | 3774 | 0.041791 |
| Internet Explorer | 12 | 3.338823 | 1928 | 0.021350 |
| Edge | 5 | 4.016332 | 1018 | 0.011273 |
| Others | 4 | 3.013546 | 3440 | 0.038093 |
# Buyer ratio: share of each label's rows that end in a transaction
_results_df['pct_transactions'] = _results_df['transactions'] / _results_df['n_rows']
_results_df
| transactions | mean_revenue_log | n_rows | pct_rows | pct_transactions | |
|---|---|---|---|---|---|
| device.browser | |||||
| Chrome | 1009 | 4.141194 | 62147 | 0.688182 | 0.016236 |
| Safari | 65 | 3.346190 | 17999 | 0.199311 | 0.003611 |
| Firefox | 15 | 3.747523 | 3774 | 0.041791 | 0.003975 |
| Internet Explorer | 12 | 3.338823 | 1928 | 0.021350 | 0.006224 |
| Edge | 5 | 4.016332 | 1018 | 0.011273 | 0.004912 |
| Others | 4 | 3.013546 | 3440 | 0.038093 | 0.001163 |
# Reorder the columns so the descriptive table reads more naturally
_results_df = _results_df[['n_rows', 'pct_rows', 'transactions', 'pct_transactions', 'mean_revenue_log']]
_results_df
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| device.browser | |||||
| Chrome | 62147 | 0.688182 | 1009 | 0.016236 | 4.141194 |
| Safari | 17999 | 0.199311 | 65 | 0.003611 | 3.346190 |
| Firefox | 3774 | 0.041791 | 15 | 0.003975 | 3.747523 |
| Internet Explorer | 1928 | 0.021350 | 12 | 0.006224 | 3.338823 |
| Edge | 1018 | 0.011273 | 5 | 0.004912 | 4.016332 |
| Others | 3440 | 0.038093 | 4 | 0.001163 | 3.013546 |
También podemos visualizar la correlación haciendo uso de un countplot
df['device.browser'].value_counts()
Chrome 62147 Safari 17999 Firefox 3774 Others 3440 Internet Explorer 1928 Edge 1018 Name: device.browser, dtype: int64
df['device.browser'].info()
<class 'pandas.core.series.Series'> Index: 90306 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Series name: device.browser Non-Null Count Dtype -------------- ----- 90306 non-null category dtypes: category(1) memory usage: 2.8+ MB
sns.countplot??
Signature: sns.countplot( data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=0.75, width=0.8, dodge=True, ax=None, **kwargs, ) Docstring: Show the counts of observations in each categorical bin using bars. A count plot can be thought of as a histogram across a categorical, instead of quantitative, variable. The basic API and options are identical to those for :func:`barplot`, so you can compare counts across nested variables. Note that the newer :func:`histplot` function offers more functionality, although its default behavior is somewhat different. .. note:: This function always treats one of the variables as categorical and draws data at ordinal positions (0, 1, ... n) on the relevant axis, even when the data has a numeric or date type. See the :ref:`tutorial <categorical_tutorial>` for more information. Parameters ---------- data : DataFrame, array, or list of arrays, optional Dataset for plotting. If ``x`` and ``y`` are absent, this is interpreted as wide-form. Otherwise it is expected to be long-form. x, y, hue : names of variables in ``data`` or vector data, optional Inputs for plotting long-form data. See examples for interpretation. order, hue_order : lists of strings, optional Order to plot the categorical levels in; otherwise the levels are inferred from the data objects. orient : "v" | "h", optional Orientation of the plot (vertical or horizontal). This is usually inferred based on the type of the input variables, but it can be used to resolve ambiguity when both `x` and `y` are numeric or when plotting wide-form data. color : matplotlib color, optional Single color for the elements in the plot. palette : palette name, list, or dict Colors to use for the different levels of the ``hue`` variable. Should be something that can be interpreted by :func:`color_palette`, or a dictionary mapping hue levels to matplotlib colors. 
saturation : float, optional Proportion of the original saturation to draw colors at. Large patches often look better with slightly desaturated colors, but set this to `1` if you want the plot colors to perfectly match the input color. dodge : bool, optional When hue nesting is used, whether elements should be shifted along the categorical axis. ax : matplotlib Axes, optional Axes object to draw the plot onto, otherwise uses the current Axes. kwargs : key, value mappings Other keyword arguments are passed through to :meth:`matplotlib.axes.Axes.bar`. Returns ------- ax : matplotlib Axes Returns the Axes object with the plot drawn onto it. See Also -------- barplot : Show point estimates and confidence intervals using bars. catplot : Combine a categorical plot with a :class:`FacetGrid`. Examples -------- .. include:: ../docstrings/countplot.rst Source: def countplot( data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=.75, width=.8, dodge=True, ax=None, **kwargs ): estimator = "size" errorbar = None n_boot = 0 units = None seed = None errcolor = None errwidth = None capsize = None if x is None and y is not None: orient = "h" x = y elif y is None and x is not None: orient = "v" y = x elif x is not None and y is not None: raise ValueError("Cannot pass values for both `x` and `y`") plotter = _CountPlotter( x, y, hue, data, order, hue_order, estimator, errorbar, n_boot, units, seed, orient, color, palette, saturation, width, errcolor, errwidth, capsize, dodge ) plotter.value_label = "count" if ax is None: ax = plt.gca() plotter.plot(ax, kwargs) return ax File: c:\users\jagui\anaconda3\lib\site-packages\seaborn\categorical.py Type: function
# Number of rows per label
plt.figure(figsize = (15,5))
sns.countplot(
data = df,
x = 'device.browser' # the variable to analyse
);
sns.boxenplot??
Signature: sns.boxenplot( data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=0.75, width=0.8, dodge=True, k_depth='tukey', linewidth=None, scale='exponential', outlier_prop=0.007, trust_alpha=0.05, showfliers=True, ax=None, box_kws=None, flier_kws=None, line_kws=None, ) Docstring: Draw an enhanced box plot for larger datasets. This style of plot was originally named a "letter value" plot because it shows a large number of quantiles that are defined as "letter values". It is similar to a box plot in plotting a nonparametric representation of a distribution in which all features correspond to actual observations. By plotting more quantiles, it provides more information about the shape of the distribution, particularly in the tails. For a more extensive explanation, you can read the paper that introduced the plot: https://vita.had.co.nz/papers/letter-value-plot.html .. note:: This function always treats one of the variables as categorical and draws data at ordinal positions (0, 1, ... n) on the relevant axis, even when the data has a numeric or date type. See the :ref:`tutorial <categorical_tutorial>` for more information. Parameters ---------- data : DataFrame, array, or list of arrays, optional Dataset for plotting. If ``x`` and ``y`` are absent, this is interpreted as wide-form. Otherwise it is expected to be long-form. x, y, hue : names of variables in ``data`` or vector data, optional Inputs for plotting long-form data. See examples for interpretation. order, hue_order : lists of strings, optional Order to plot the categorical levels in; otherwise the levels are inferred from the data objects. orient : "v" | "h", optional Orientation of the plot (vertical or horizontal). This is usually inferred based on the type of the input variables, but it can be used to resolve ambiguity when both `x` and `y` are numeric or when plotting wide-form data. 
color : matplotlib color, optional Single color for the elements in the plot. palette : palette name, list, or dict Colors to use for the different levels of the ``hue`` variable. Should be something that can be interpreted by :func:`color_palette`, or a dictionary mapping hue levels to matplotlib colors. saturation : float, optional Proportion of the original saturation to draw colors at. Large patches often look better with slightly desaturated colors, but set this to `1` if you want the plot colors to perfectly match the input color. width : float, optional Width of a full element when not using hue nesting, or width of all the elements for one level of the major grouping variable. dodge : bool, optional When hue nesting is used, whether elements should be shifted along the categorical axis. k_depth : {"tukey", "proportion", "trustworthy", "full"} or scalar The number of boxes, and by extension number of percentiles, to draw. All methods are detailed in Wickham's paper. Each makes different assumptions about the number of outliers and leverages different statistical properties. If "proportion", draw no more than `outlier_prop` extreme observations. If "full", draw `log(n)+1` boxes. linewidth : float, optional Width of the gray lines that frame the plot elements. scale : {"exponential", "linear", "area"}, optional Method to use for the width of the letter value boxes. All give similar results visually. "linear" reduces the width by a constant linear factor, "exponential" uses the proportion of data not covered, "area" is proportional to the percentage of data covered. outlier_prop : float, optional Proportion of data believed to be outliers. Must be in the range (0, 1]. Used to determine the number of boxes to plot when `k_depth="proportion"`. trust_alpha : float, optional Confidence level for a box to be plotted. Used to determine the number of boxes to plot when `k_depth="trustworthy"`. Must be in the range (0, 1). 
showfliers : bool, optional If False, suppress the plotting of outliers. ax : matplotlib Axes, optional Axes object to draw the plot onto, otherwise uses the current Axes. box_kws: dict, optional Keyword arguments for the box artists; passed to :class:`matplotlib.patches.Rectangle`. line_kws: dict, optional Keyword arguments for the line denoting the median; passed to :meth:`matplotlib.axes.Axes.plot`. flier_kws: dict, optional Keyword arguments for the scatter denoting the outlier observations; passed to :meth:`matplotlib.axes.Axes.scatter`. Returns ------- ax : matplotlib Axes Returns the Axes object with the plot drawn onto it. See Also -------- violinplot : A combination of boxplot and kernel density estimation. boxplot : A traditional box-and-whisker plot with a similar API. catplot : Combine a categorical plot with a :class:`FacetGrid`. Examples -------- .. include:: ../docstrings/boxenplot.rst Source: def boxenplot( data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=.75, width=.8, dodge=True, k_depth='tukey', linewidth=None, scale='exponential', outlier_prop=0.007, trust_alpha=0.05, showfliers=True, ax=None, box_kws=None, flier_kws=None, line_kws=None, ): plotter = _LVPlotter(x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, k_depth, linewidth, scale, outlier_prop, trust_alpha, showfliers) if ax is None: ax = plt.gca() plotter.plot(ax, box_kws, flier_kws, line_kws) return ax File: c:\users\jagui\anaconda3\lib\site-packages\seaborn\categorical.py Type: function
Habíamos visto como se comporta el target en conjunto. Ahora quiero ver como se comporta con respecto a cada etiqueta de la variable. Para ello utilizamos un boxenplot y vemos la distribución del target. Y eso lo hacemos sólo sobre las sesiones en las que hay compra porque quiero ver en qué etiquetas se gasta más y en cuales se gasta menos.
# Mean transaction per label, restricted to buyers
plt.figure(figsize=(15,5))
sns.boxenplot(
data = df[df['visitWithTransaction'] == 1], # only the sessions where a purchase happened
x = 'device.browser', # the categorical variable to analyse
y = target_log # the target
);
En el boxenplot tenemos el percentil 25, el 50 y el 75 al igual que en un boxplot. Pero aquí nos da una distribución más exacta. Cada caja por encima del percentil 75 tiene un ancho de la mitad de la caja inferior. Y cada caja por debajo del percentil 25 tiene un ancho de la mitad de la caja superior.
Definimos funciones con todos los pasos que hemos dado para automatizarlos con el resto de variables
def plot_cat_values(dataframe, column, target_column):
    '''
    Plot the relationship between a categorical variable (column) and the
    target (target_column) of the given dataframe using two stacked charts:
    a count plot of the labels and a boxen plot of the target per label.
    '''
    # One shared label order so both charts line up vertically
    label_order = list(dataframe[column].unique())
    plt.figure(figsize=(15, 8))
    # Top chart (slot 1 of a 2x1 grid): number of observations per label
    top_ax = plt.subplot(2, 1, 1)
    top_ax = sns.countplot(
        data=dataframe,
        x=column,
        order=label_order,
    )
    # Bottom chart (slot 2): distribution of the target, purchases only
    # (target > 0), so the zero-revenue sessions do not squash the plot
    bottom_ax = plt.subplot(2, 1, 2)
    bottom_ax = sns.boxenplot(
        data=dataframe[dataframe[target_column] > 0],
        x=column,
        y=target_column,
        order=label_order,
    )
    # Nothing to return: the function only renders the figure
    plt.show()
# Check that the function works as expected
plot_cat_values(dataframe = df, column = 'device.browser', target_column = target_log);
def explore_cat_values(dataframe, column, target_column):
    '''
    Build a descriptive table relating a categorical variable (column) to the
    target (target_column): rows per label, row share, number of transactions,
    buyer ratio, and mean target value among buyers.
    '''
    # Sessions that ended in a purchase (target > 0)
    buyers = dataframe[dataframe[target_column] > 0]
    # Per-label transaction count and mean target value among buyers
    summary = buyers.pivot_table(
        index=column,
        values=target_column,
        aggfunc=[len, np.mean],
    )
    summary.columns = ['transactions', 'mean_revenue_log']
    # Per-label row counts over the whole dataframe, absolute and relative
    summary['n_rows'] = dataframe[column].value_counts(dropna=False)
    summary['pct_rows'] = dataframe[column].value_counts(normalize=True, dropna=False)
    # Share of each label's rows that ended in a transaction
    summary['pct_transactions'] = summary['transactions'] / summary['n_rows']
    # Column order chosen for readability
    return summary[['n_rows', 'pct_rows', 'transactions', 'pct_transactions', 'mean_revenue_log']]
# Check that the function works as expected
explore_cat_values(dataframe = df, column = 'device.browser', target_column = target_log)
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| device.browser | |||||
| Chrome | 62147 | 0.688182 | 1009 | 0.016236 | 4.141194 |
| Safari | 17999 | 0.199311 | 65 | 0.003611 | 3.346190 |
| Firefox | 3774 | 0.041791 | 15 | 0.003975 | 3.747523 |
| Internet Explorer | 1928 | 0.021350 | 12 | 0.006224 | 3.338823 |
| Edge | 1018 | 0.011273 | 5 | 0.004912 | 4.016332 |
| Others | 3440 | 0.038093 | 4 | 0.001163 | 3.013546 |
def setOthers(dataframe, column, num_values):
    """
    Collapse infrequent labels of a categorical column into 'Others'.

    Keeps the ``num_values`` most frequent labels of ``column``; every other
    label is mapped to the single label 'Others'. As a side effect the column
    in ``dataframe`` is cast to a pandas Categorical (with NaN for dropped
    labels); the returned Series is that categorical with 'Others' filled in.
    """
    # Most frequent labels, plus the catch-all bucket
    kept_labels = dataframe[column].value_counts().index[:num_values].tolist()
    kept_labels = kept_labels + ['Others']
    # Labels outside kept_labels become NaN after the categorical cast...
    dataframe[column] = pd.Categorical(dataframe[column], categories=kept_labels)
    # ...and those NaN rows are exactly the ones that belong in 'Others'.
    # Assigning the fillna result is preferred over inplace=True.
    return dataframe[column].fillna('Others')
Device.browser es una variable categórica, que hemos reducido de etiquetas y aplicaremos un OHE.
Como no se el número de etiquetas que tengo en total, empiezo haciendo un value_counts
df['device.operatingSystem'].value_counts()
Windows 35174 Macintosh 25276 Android 12284 iOS 10732 Linux 3543 Chrome OS 2641 (not set) 444 Windows Phone 118 Samsung 28 BlackBerry 25 Nintendo Wii 16 Firefox OS 10 Nintendo WiiU 7 Xbox 5 Nintendo 3DS 1 Nokia 1 FreeBSD 1 Name: device.operatingSystem, dtype: int64
Data Preparation --> Reducción de etiquetas
df['device.operatingSystem'] = setOthers(dataframe = df, column = 'device.operatingSystem', num_values = 6)
df['device.operatingSystem'].value_counts()
Windows 35174 Macintosh 25276 Android 12284 iOS 10732 Linux 3543 Chrome OS 2641 Others 656 Name: device.operatingSystem, dtype: int64
Data Understanding --> Análisis de la variable frente al target
plot_cat_values(dataframe = df, column = 'device.operatingSystem', target_column = target_log)
explore_cat_values(dataframe = df, column = 'device.operatingSystem', target_column = target_log)
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| device.operatingSystem | |||||
| Windows | 35174 | 0.389498 | 204 | 0.005800 | 4.033411 |
| Macintosh | 25276 | 0.279893 | 627 | 0.024806 | 4.154472 |
| Android | 12284 | 0.136026 | 43 | 0.003500 | 3.693958 |
| iOS | 10732 | 0.118840 | 47 | 0.004379 | 3.409389 |
| Linux | 3543 | 0.039233 | 85 | 0.023991 | 3.771755 |
| Chrome OS | 2641 | 0.029245 | 104 | 0.039379 | 4.394535 |
| Others | 656 | 0.007264 | 0 | 0.000000 | NaN |
Device.operatingsystem es una variable categórica, que hemos reducido de etiquetas y aplicaremos un OHE.
explore_cat_values(dataframe = df, column = 'device.isMobile', target_column = target_log)
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| device.isMobile | |||||
| False | 66571 | 0.737171 | 1017 | 0.015277 | 4.124387 |
| True | 23735 | 0.262829 | 93 | 0.003918 | 3.547096 |
Se observa que en Desktop se hace compra en un mayor porcentaje de ocasiones y el importe es más alto
# Vemos qué tipo de variable es
df['device.isMobile'].dtype
dtype('bool')
Podemos dejar la variable como booleana, ya que en principio no debería darnos problemas. Si cuando entrenemos el algoritmo falla, es porque el algoritmo que estamos utilizando no permite usar booleanos y deberemos cambiarla a integer.
# La convertimos a integer
df['device.isMobile'] = df['device.isMobile'].astype(int)
df['device.isMobile'].dtype
dtype('int32')
explore_cat_values(dataframe = df, column = 'device.deviceCategory', target_column = target_log)
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| device.deviceCategory | |||||
| desktop | 66572 | 0.737182 | 1017 | 0.015277 | 4.124387 |
| mobile | 20733 | 0.229586 | 69 | 0.003328 | 3.542712 |
| tablet | 3001 | 0.033231 | 24 | 0.007997 | 3.559700 |
Vemos la relación que hay entre dos categóricas ('ismobile' y 'devicecategory') y para ello hacemos lo mismo que se hace para determinar la Confusion Matrix, utilizar el crosstab. La idea es ver si estas dos variables me están diciendo información parecida o no y también quiero saber si la etiqueta "tablet" estaba antes, dentro de moviles o no moviles.
# Relación entre dos categóricas
pd.crosstab(
df['device.isMobile'],
df['device.deviceCategory']
)
| device.deviceCategory | desktop | mobile | tablet |
|---|---|---|---|
| device.isMobile | |||
| 0 | 66558 | 12 | 1 |
| 1 | 14 | 20721 | 3000 |
Tenemos tres opciones:
Si usamos un decisiontree dejaría las dos, porque es un algoritmo robusto a variables correlacionadas. Si usamos una regresión lineal, no es tan robusto con variables correlacionadas (predice bien pero no explica bien la importancia de cada una), por lo que me quedaría con una de ellas.
Se trata de ir probando las tres opciones y ver con cuál se comporta mejor el algoritmo.
explore_cat_values(dataframe = df, column = 'channelGrouping', target_column = target_log)
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| channelGrouping | |||||
| Affiliates | 1648 | 0.018249 | 1 | 0.000607 | 2.193886 |
| Direct | 14061 | 0.155704 | 214 | 0.015219 | 4.398677 |
| Display | 570 | 0.006312 | 10 | 0.017544 | 4.226333 |
| Organic Search | 38445 | 0.425719 | 326 | 0.008480 | 3.800753 |
| Paid Search | 2507 | 0.027761 | 39 | 0.015556 | 3.852479 |
| Referral | 10501 | 0.116282 | 511 | 0.048662 | 4.137267 |
| Social | 22563 | 0.249851 | 9 | 0.000399 | 3.907991 |
Muestra el canal por el cual entra el usuario:
La etiqueta más interesante es la de Referral, que tiene la tasa de conversión de compra más alta con diferencia. Affiliates, Display y Social no son representativas, ya que tienen un número de transacciones muy pequeño en este sample (quizás habría que utilizar todo el dataset para saber qué comportamiento tienen).
Por otro lado, vemos que Affiliates sólo tiene una transacción. Si utilizamos un decision tree, va a decidir que todos los que ingresen por este canal no van a hacer compra. Recordemos que los decision tree discriminan tanto los que tienen muchos 1 como los que tienen muchos 0.
plot_cat_values(dataframe = df, column = 'channelGrouping', target_column = target_log)
Es una variable categórica. Haremos un OHE.
df['date'].describe()
count 9.030600e+04 mean 2.016589e+07 std 4.697568e+03 min 2.016080e+07 25% 2.016103e+07 50% 2.017011e+07 75% 2.017042e+07 max 2.017080e+07 Name: date, dtype: float64
df['date'].head()
sessionId 1438082600262726746_1472803483 20160902 1283542838194038522_1472885255 20160902 4339756682310369249_1472828340 20160902 062441254657008214_1472875520 20160902 1381975521299261523_1472829727 20160902 Name: date, dtype: int64
Está en formato numérico --> año_mes_día
Para trabajar en SQL es más fácil con este formato
Tenemos que cambiarla a un formato datetime para que Python entienda lo que es año, mes y día
pd.to_datetime(
df['date']
)
sessionId
1438082600262726746_1472803483 1970-01-01 00:00:00.020160902
1283542838194038522_1472885255 1970-01-01 00:00:00.020160902
4339756682310369249_1472828340 1970-01-01 00:00:00.020160902
062441254657008214_1472875520 1970-01-01 00:00:00.020160902
1381975521299261523_1472829727 1970-01-01 00:00:00.020160902
...
8316382343226738015_1483549157 1970-01-01 00:00:00.020170104
6636384798982309878_1483525429 1970-01-01 00:00:00.020170104
469840327005431380_1483573235 1970-01-01 00:00:00.020170104
2140149974339316233_1483557808 1970-01-01 00:00:00.020170104
5123779100307500332_1483554750 1970-01-01 00:00:00.020170104
Name: date, Length: 90306, dtype: datetime64[ns]
Tenemos que pasarle un formato al datetime indicando cómo está expresado el date; si no, lo convierte sin sentido, ya que es un número que no está separado por guiones año-mes-día (el formato que Python lee por defecto). Siempre que no venga en este formato, tendremos que especificárselo a Python con el parámetro format.
# Convert the column to datetime
df['date'] = pd.to_datetime(
df['date'], # series to convert
format='%Y%m%d' # source layout: first 4 digits are the year, next 2 the month, last 2 the day
)
df['date'].head()
sessionId 1438082600262726746_1472803483 2016-09-02 1283542838194038522_1472885255 2016-09-02 4339756682310369249_1472828340 2016-09-02 062441254657008214_1472875520 2016-09-02 1381975521299261523_1472829727 2016-09-02 Name: date, dtype: datetime64[ns]
# Extract all the calendar information from the date column.
# In an e-commerce dataset the date matters a lot (buying in winter is not
# the same as buying in summer), so we derive several seasonal features.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['monthDay'] = df['date'].dt.day
df['weekDay'] = df['date'].dt.weekday
df['quarter'] = df['date'].dt.quarter
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is the supported replacement (cast back to int to
# keep the plain integer dtype the old accessor returned).
df['week'] = df['date'].dt.isocalendar().week.astype(int)
df.head(5).T
| sessionId | 1438082600262726746_1472803483 | 1283542838194038522_1472885255 | 4339756682310369249_1472828340 | 062441254657008214_1472875520 | 1381975521299261523_1472829727 |
|---|---|---|---|---|---|
| Unnamed: 0 | 13 | 27 | 28 | 37 | 43 |
| channelGrouping | Organic Search | Organic Search | Referral | Organic Search | Organic Search |
| date | 2016-09-02 00:00:00 | 2016-09-02 00:00:00 | 2016-09-02 00:00:00 | 2016-09-02 00:00:00 | 2016-09-02 00:00:00 |
| visitNumber | 1 | 1 | 1 | 1 | 1 |
| visitStartTime | 1472803483 | 1472885255 | 1472828340 | 1472875520 | 1472829727 |
| device.browser | Safari | Safari | Chrome | Chrome | Chrome |
| device.operatingSystem | iOS | Macintosh | Android | Windows | Macintosh |
| device.isMobile | 1 | 0 | 1 | 0 | 0 |
| device.deviceCategory | mobile | desktop | mobile | desktop | desktop |
| geoNetwork.continent | Asia | Europe | Asia | Oceania | Asia |
| geoNetwork.subContinent | Southern Asia | Eastern Europe | Southern Asia | Australasia | Eastern Asia |
| geoNetwork.country | Pakistan | Hungary | India | Australia | South Korea |
| geoNetwork.region | Sindh | not available in demo dataset | Karnataka | not available in demo dataset | Seoul |
| geoNetwork.metro | (not set) | not available in demo dataset | (not set) | not available in demo dataset | (not set) |
| geoNetwork.city | Karachi | not available in demo dataset | Bengaluru | not available in demo dataset | Seoul |
| geoNetwork.networkDomain | unknown.unknown | broadband.hu | unknown.unknown | uwa.edu.au | unknown.unknown |
| totals.hits | 1 | 1 | 1 | 1 | 1 |
| totals.pageviews | 1 | 1 | 1 | 1 | 1 |
| totals.bounces | 1 | 1 | 1 | 1 | 1 |
| totals.newVisits | 1 | 1 | 1 | 1 | 1 |
| totals.transactionRevenue | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| visitWithTransaction | 0 | 0 | 0 | 0 | 0 |
| totals.transactionRevenue_log | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| year | 2016 | 2016 | 2016 | 2016 | 2016 |
| month | 9 | 9 | 9 | 9 | 9 |
| monthDay | 2 | 2 | 2 | 2 | 2 |
| weekDay | 4 | 4 | 4 | 4 | 4 |
| quarter | 3 | 3 | 3 | 3 | 3 |
| week | 35 | 35 | 35 | 35 | 35 |
# Extraida toda la información, eliminamos la variable
df.drop('date', axis=1, inplace=True)
Un timestamp es un valor que cuenta los segundos desde un tiempo concreto. Por lo tanto, va a ser una variable numérica; lo vemos.
df['visitStartTime'].describe()
count 9.030600e+04 mean 1.485020e+09 std 9.027413e+06 min 1.470035e+09 25% 1.477570e+09 50% 1.483985e+09 75% 1.492787e+09 max 1.501656e+09 Name: visitStartTime, dtype: float64
df['visitStartTime'].head()
sessionId 1438082600262726746_1472803483 1472803483 1283542838194038522_1472885255 1472885255 4339756682310369249_1472828340 1472828340 062441254657008214_1472875520 1472875520 1381975521299261523_1472829727 1472829727 Name: visitStartTime, dtype: int64
Vemos que la mayoría son muy parecidos. Tenemos que convertir estos valores. Para eso utilizamos un timestamp converter para convertirla a una fecha real, por ejemplo: https://www.epochconverter.com/
Por ejemplo para un timestamp de 1472803483, la fecha será el 2 de septiembre de 2016 10:04:43 GMT+02:00 DST
Para el algoritmo la hora seguro que es relevante, pero el minuto igual no. Saco simplemente la hora
# Me devuelve el año, mes, día, hora, minuto y segundo
datetime.fromtimestamp(1472803483)
datetime.datetime(2016, 9, 2, 10, 4, 43)
# Me devuelve la hora
datetime.fromtimestamp(1472803483).hour
10
Creamos una variable que me saque las horas para todas las filas. Para ello aplicamos el apply (hace como un for loop a cada una de las filas, aplicando la función que yo defina)
# Extract the hour of day from the Unix timestamp (seconds since epoch)
# NOTE(review): datetime.fromtimestamp converts using the LOCAL timezone of
# the machine running the notebook — confirm that is the intended reference
# clock before replacing this with a vectorized UTC-based conversion.
df['visitHour'] = df['visitStartTime'].apply(lambda x: datetime.fromtimestamp(x).hour)
df['visitHour']
sessionId
1438082600262726746_1472803483 10
1283542838194038522_1472885255 8
4339756682310369249_1472828340 16
062441254657008214_1472875520 6
1381975521299261523_1472829727 17
..
8316382343226738015_1483549157 17
6636384798982309878_1483525429 11
469840327005431380_1483573235 0
2140149974339316233_1483557808 20
5123779100307500332_1483554750 19
Name: visitHour, Length: 90306, dtype: int64
# Extraida la información, ya puedo eliminar la variable porque ya tenemos el año, mes, dia que sacamos del date
df.drop('visitStartTime', axis=1, inplace=True)
df.head().T
| sessionId | 1438082600262726746_1472803483 | 1283542838194038522_1472885255 | 4339756682310369249_1472828340 | 062441254657008214_1472875520 | 1381975521299261523_1472829727 |
|---|---|---|---|---|---|
| Unnamed: 0 | 13 | 27 | 28 | 37 | 43 |
| channelGrouping | Organic Search | Organic Search | Referral | Organic Search | Organic Search |
| visitNumber | 1 | 1 | 1 | 1 | 1 |
| device.browser | Safari | Safari | Chrome | Chrome | Chrome |
| device.operatingSystem | iOS | Macintosh | Android | Windows | Macintosh |
| device.isMobile | 1 | 0 | 1 | 0 | 0 |
| device.deviceCategory | mobile | desktop | mobile | desktop | desktop |
| geoNetwork.continent | Asia | Europe | Asia | Oceania | Asia |
| geoNetwork.subContinent | Southern Asia | Eastern Europe | Southern Asia | Australasia | Eastern Asia |
| geoNetwork.country | Pakistan | Hungary | India | Australia | South Korea |
| geoNetwork.region | Sindh | not available in demo dataset | Karnataka | not available in demo dataset | Seoul |
| geoNetwork.metro | (not set) | not available in demo dataset | (not set) | not available in demo dataset | (not set) |
| geoNetwork.city | Karachi | not available in demo dataset | Bengaluru | not available in demo dataset | Seoul |
| geoNetwork.networkDomain | unknown.unknown | broadband.hu | unknown.unknown | uwa.edu.au | unknown.unknown |
| totals.hits | 1 | 1 | 1 | 1 | 1 |
| totals.pageviews | 1 | 1 | 1 | 1 | 1 |
| totals.bounces | 1 | 1 | 1 | 1 | 1 |
| totals.newVisits | 1 | 1 | 1 | 1 | 1 |
| totals.transactionRevenue | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| visitWithTransaction | 0 | 0 | 0 | 0 | 0 |
| totals.transactionRevenue_log | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| year | 2016 | 2016 | 2016 | 2016 | 2016 |
| month | 9 | 9 | 9 | 9 | 9 |
| monthDay | 2 | 2 | 2 | 2 | 2 |
| weekDay | 4 | 4 | 4 | 4 | 4 |
| quarter | 3 | 3 | 3 | 3 | 3 |
| week | 35 | 35 | 35 | 35 | 35 |
| visitHour | 10 | 8 | 16 | 6 | 17 |
df['visitNumber'].head()
sessionId 1438082600262726746_1472803483 1 1283542838194038522_1472885255 1 4339756682310369249_1472828340 1 062441254657008214_1472875520 1 1381975521299261523_1472829727 1 Name: visitNumber, dtype: int64
df['visitNumber'].describe()
count 90306.000000 mean 2.254269 std 9.102378 min 1.000000 25% 1.000000 50% 1.000000 75% 1.000000 max 373.000000 Name: visitNumber, dtype: float64
Relación con el target
Recordemos que si estamos en una regresión lineal, la relación tiene que ser lineal, sino el algoritmo no funcionaría bien
Para ello haremos una visualización utilizando un regplot (es un scatterplot más una regresión lineal por encima)
sns.regplot??
Signature: sns.regplot( data=None, *, x=None, y=None, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, seed=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=True, dropna=True, x_jitter=None, y_jitter=None, label=None, color=None, marker='o', scatter_kws=None, line_kws=None, ax=None, ) Docstring: Plot data and a linear regression model fit. There are a number of mutually exclusive options for estimating the regression model. See the :ref:`tutorial <regression_tutorial>` for more information. Parameters ---------- x, y: string, series, or vector array Input variables. If strings, these should correspond with column names in ``data``. When pandas objects are used, axes will be labeled with the series name. data : DataFrame Tidy ("long-form") dataframe where each column is a variable and each row is an observation. x_estimator : callable that maps vector -> scalar, optional Apply this function to each unique value of ``x`` and plot the resulting estimate. This is useful when ``x`` is a discrete variable. If ``x_ci`` is given, this estimate will be bootstrapped and a confidence interval will be drawn. x_bins : int or vector, optional Bin the ``x`` variable into discrete bins and then estimate the central tendency and a confidence interval. This binning only influences how the scatterplot is drawn; the regression is still fit to the original data. This parameter is interpreted either as the number of evenly-sized (not necessary spaced) bins or the positions of the bin centers. When this parameter is used, it implies that the default of ``x_estimator`` is ``numpy.mean``. x_ci : "ci", "sd", int in [0, 100] or None, optional Size of the confidence interval used when plotting a central tendency for discrete values of ``x``. If ``"ci"``, defer to the value of the ``ci`` parameter. 
If ``"sd"``, skip bootstrapping and show the standard deviation of the observations in each bin. scatter : bool, optional If ``True``, draw a scatterplot with the underlying observations (or the ``x_estimator`` values). fit_reg : bool, optional If ``True``, estimate and plot a regression model relating the ``x`` and ``y`` variables. ci : int in [0, 100] or None, optional Size of the confidence interval for the regression estimate. This will be drawn using translucent bands around the regression line. The confidence interval is estimated using a bootstrap; for large datasets, it may be advisable to avoid that computation by setting this parameter to None. n_boot : int, optional Number of bootstrap resamples used to estimate the ``ci``. The default value attempts to balance time and stability; you may want to increase this value for "final" versions of plots. units : variable name in ``data``, optional If the ``x`` and ``y`` observations are nested within sampling units, those can be specified here. This will be taken into account when computing the confidence intervals by performing a multilevel bootstrap that resamples both units and observations (within unit). This does not otherwise influence how the regression is estimated or drawn. seed : int, numpy.random.Generator, or numpy.random.RandomState, optional Seed or random number generator for reproducible bootstrapping. order : int, optional If ``order`` is greater than 1, use ``numpy.polyfit`` to estimate a polynomial regression. logistic : bool, optional If ``True``, assume that ``y`` is a binary variable and use ``statsmodels`` to estimate a logistic regression model. Note that this is substantially more computationally intensive than linear regression, so you may wish to decrease the number of bootstrap resamples (``n_boot``) or set ``ci`` to None. lowess : bool, optional If ``True``, use ``statsmodels`` to estimate a nonparametric lowess model (locally weighted linear regression). 
Note that confidence intervals cannot currently be drawn for this kind of model. robust : bool, optional If ``True``, use ``statsmodels`` to estimate a robust regression. This will de-weight outliers. Note that this is substantially more computationally intensive than standard linear regression, so you may wish to decrease the number of bootstrap resamples (``n_boot``) or set ``ci`` to None. logx : bool, optional If ``True``, estimate a linear regression of the form y ~ log(x), but plot the scatterplot and regression model in the input space. Note that ``x`` must be positive for this to work. {x,y}_partial : strings in ``data`` or matrices Confounding variables to regress out of the ``x`` or ``y`` variables before plotting. truncate : bool, optional If ``True``, the regression line is bounded by the data limits. If ``False``, it extends to the ``x`` axis limits. {x,y}_jitter : floats, optional Add uniform random noise of this size to either the ``x`` or ``y`` variables. The noise is added to a copy of the data after fitting the regression, and only influences the look of the scatterplot. This can be helpful when plotting variables that take discrete values. label : string Label to apply to either the scatterplot or regression line (if ``scatter`` is ``False``) for use in a legend. color : matplotlib color Color to apply to all plot elements; will be superseded by colors passed in ``scatter_kws`` or ``line_kws``. marker : matplotlib marker code Marker to use for the scatterplot glyphs. {scatter,line}_kws : dictionaries Additional keyword arguments to pass to ``plt.scatter`` and ``plt.plot``. ax : matplotlib Axes, optional Axes object to draw the plot onto, otherwise uses the current Axes. Returns ------- ax : matplotlib Axes The Axes object containing the plot. See Also -------- lmplot : Combine :func:`regplot` and :class:`FacetGrid` to plot multiple linear relationships in a dataset. 
jointplot : Combine :func:`regplot` and :class:`JointGrid` (when used with ``kind="reg"``). pairplot : Combine :func:`regplot` and :class:`PairGrid` (when used with ``kind="reg"``). residplot : Plot the residuals of a linear regression model. Notes ----- The :func:`regplot` and :func:`lmplot` functions are closely related, but the former is an axes-level function while the latter is a figure-level function that combines :func:`regplot` and :class:`FacetGrid`. It's also easy to combine :func:`regplot` and :class:`JointGrid` or :class:`PairGrid` through the :func:`jointplot` and :func:`pairplot` functions, although these do not directly accept all of :func:`regplot`'s parameters. Examples -------- .. include: ../docstrings/regplot.rst Source: def regplot( data=None, *, x=None, y=None, x_estimator=None, x_bins=None, x_ci="ci", scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, seed=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=True, dropna=True, x_jitter=None, y_jitter=None, label=None, color=None, marker="o", scatter_kws=None, line_kws=None, ax=None ): plotter = _RegressionPlotter(x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, color, label) if ax is None: ax = plt.gca() scatter_kws = {} if scatter_kws is None else copy.copy(scatter_kws) scatter_kws["marker"] = marker line_kws = {} if line_kws is None else copy.copy(line_kws) plotter.plot(ax, scatter_kws, line_kws) return ax File: c:\users\jagui\anaconda3\lib\site-packages\seaborn\regression.py Type: function
plt.figure(figsize=(15,5))
sns.regplot(
data = df[df['visitWithTransaction'] == 1], # Le pasamos sólo los datos de los que hacen compra (para no tener un montón de 0 que no me interesan tanto)
x = 'visitNumber', # Le pasamos la variable que queremos analizar
y = target_log # Le pasamos el target
);
No tiene pinta de que haya una correlación entre ambas variables. Lo vemos haciendo la correlación de Pearson (es la que hace por defecto)
df[['visitNumber', target_log]].head()
| visitNumber | totals.transactionRevenue_log | |
|---|---|---|
| sessionId | ||
| 1438082600262726746_1472803483 | 1 | 0.0 |
| 1283542838194038522_1472885255 | 1 | 0.0 |
| 4339756682310369249_1472828340 | 1 | 0.0 |
| 062441254657008214_1472875520 | 1 | 0.0 |
| 1381975521299261523_1472829727 | 1 | 0.0 |
# Correlación de Pearson
df[['visitNumber', target_log]].corr()
| visitNumber | totals.transactionRevenue_log | |
|---|---|---|
| visitNumber | 1.000000 | 0.020139 |
| totals.transactionRevenue_log | 0.020139 | 1.000000 |
Efectivamente, la correlación lineal entre ambas es prácticamente nula.
Si transformamos las dos variables a una distribución normal es mucho más probable que consiga capturar relaciones lineales, si es que las hay.
# Transform toward a more Gaussian-like distribution.
# np.log1p computes the NATURAL log of (1 + x) — the original comment
# incorrectly said base-10. Calling it on the whole Series is vectorized,
# so no per-row apply(lambda) is needed; the resulting values are identical.
df['visitNumber_log'] = np.log1p(df['visitNumber'])
df['visitNumber_log']
sessionId
1438082600262726746_1472803483 0.693147
1283542838194038522_1472885255 0.693147
4339756682310369249_1472828340 0.693147
062441254657008214_1472875520 0.693147
1381975521299261523_1472829727 0.693147
...
8316382343226738015_1483549157 0.693147
6636384798982309878_1483525429 0.693147
469840327005431380_1483573235 0.693147
2140149974339316233_1483557808 0.693147
5123779100307500332_1483554750 0.693147
Name: visitNumber_log, Length: 90306, dtype: float64
# Otra forma de hacerlo
np.log1p(df['visitNumber'])
sessionId
1438082600262726746_1472803483 0.693147
1283542838194038522_1472885255 0.693147
4339756682310369249_1472828340 0.693147
062441254657008214_1472875520 0.693147
1381975521299261523_1472829727 0.693147
...
8316382343226738015_1483549157 0.693147
6636384798982309878_1483525429 0.693147
469840327005431380_1483573235 0.693147
2140149974339316233_1483557808 0.693147
5123779100307500332_1483554750 0.693147
Name: visitNumber, Length: 90306, dtype: float64
# Graficamos las dos variables normalizadas
plt.figure(figsize=(15,5))
sns.regplot(
data = df[df['visitWithTransaction'] == 1], # Le pasamos sólo los datos de los que hacen compra (para no tener un montón de 0 que no me interesan tanto)
x = 'visitNumber_log', # Le pasamos la variable que queremos analizar
y = target_log # Le pasamos el target
);
Las observaciones de la parte baja del visitNumber, que antes estaban muy concentradas, las separa; y las de la parte alta, que antes estaban muy dispersas, las junta. De esta forma, la distribución queda un poco más homogénea.
df[['visitNumber_log', target_log]].corr()
| visitNumber_log | totals.transactionRevenue_log | |
|---|---|---|
| visitNumber_log | 1.000000 | 0.089364 |
| totals.transactionRevenue_log | 0.089364 | 1.000000 |
Si finalmente utilizamos una regresión lineal, usaremos la variable normalizada visitNumber_log porque si captura algún tipo de relación lineal la va a capturar con esta variable y no con visitNumber.
Es el número de clicks al carrito que hago
df['totals.hits'].head()
sessionId 1438082600262726746_1472803483 1 1283542838194038522_1472885255 1 4339756682310369249_1472828340 1 062441254657008214_1472875520 1 1381975521299261523_1472829727 1 Name: totals.hits, dtype: object
Está como object. Tenemos que cambiarla a numérica
# Convierto a integer
df['totals.hits'] = df['totals.hits'].astype(int) # Como no hay cliks que sea un número decimal, la dejo como int
df['totals.hits'].describe()
count 90306.000000 mean 4.596251 std 9.670030 min 1.000000 25% 1.000000 50% 2.000000 75% 4.000000 max 500.000000 Name: totals.hits, dtype: float64
plt.figure(figsize=(15,5))
sns.regplot(
data = df[df['visitWithTransaction'] == 1], # Le pasamos los datos que tienen compras
x = 'totals.hits', # Le paso la variable que quiero analizar
y = target_log # Le paso el target
);
# Normalizamos la variable
df['totals.hits_log'] = df['totals.hits'].apply(lambda x: np.log1p(x))
plt.figure(figsize=(15,5))
sns.regplot(
data = df[df['visitWithTransaction'] == 1],
x = 'totals.hits_log',
y = target_log
);
# Correlación de Pearson
df[['totals.hits_log', target_log]].corr()
| totals.hits_log | totals.transactionRevenue_log | |
|---|---|---|
| totals.hits_log | 1.000000 | 0.299851 |
| totals.transactionRevenue_log | 0.299851 | 1.000000 |
df.pivot_table(index='totals.hits', values='visitWithTransaction', aggfunc=[len, np.sum, np.mean]).head(30)
| len | sum | mean | |
|---|---|---|---|
| visitWithTransaction | visitWithTransaction | visitWithTransaction | |
| totals.hits | |||
| 1 | 44587 | 0 | 0.000000 |
| 2 | 13939 | 0 | 0.000000 |
| 3 | 6961 | 0 | 0.000000 |
| 4 | 4209 | 0 | 0.000000 |
| 5 | 3071 | 2 | 0.000651 |
| 6 | 2402 | 1 | 0.000416 |
| 7 | 1932 | 2 | 0.001035 |
| 8 | 1486 | 4 | 0.002692 |
| 9 | 1265 | 7 | 0.005534 |
| 10 | 1068 | 10 | 0.009363 |
| 11 | 960 | 21 | 0.021875 |
| 12 | 797 | 29 | 0.036386 |
| 13 | 735 | 32 | 0.043537 |
| 14 | 646 | 41 | 0.063467 |
| 15 | 546 | 33 | 0.060440 |
| 16 | 458 | 31 | 0.067686 |
| 17 | 428 | 37 | 0.086449 |
| 18 | 367 | 32 | 0.087193 |
| 19 | 345 | 26 | 0.075362 |
| 20 | 298 | 34 | 0.114094 |
| 21 | 282 | 35 | 0.124113 |
| 22 | 237 | 22 | 0.092827 |
| 23 | 239 | 28 | 0.117155 |
| 24 | 203 | 28 | 0.137931 |
| 25 | 196 | 29 | 0.147959 |
| 26 | 176 | 25 | 0.142045 |
| 27 | 144 | 24 | 0.166667 |
| 28 | 166 | 30 | 0.180723 |
| 29 | 127 | 23 | 0.181102 |
| 30 | 145 | 32 | 0.220690 |
En general:
Número de páginas que visita en nuestra web en cada sesión
df['totals.pageviews'].head(5)
sessionId 1438082600262726746_1472803483 1 1283542838194038522_1472885255 1 4339756682310369249_1472828340 1 062441254657008214_1472875520 1 1381975521299261523_1472829727 1 Name: totals.pageviews, dtype: object
df['totals.pageviews'].describe()
count 90296 unique 125 top 1 freq 45150 Name: totals.pageviews, dtype: object
df['totals.pageviews'].isnull().sum()
10
¿Cómo podemos imputar los nulos?
# Impute missing page views with 0 and cast to integer in one assignment.
# 0 is extreme enough to distinguish imputed rows from real page counts,
# but not so extreme that it would distort a linear regression.
# NOTE: `df['totals.pageviews'].fillna(0, inplace=True)` mutates through a
# column selection (chained assignment), which is unreliable under pandas
# Copy-on-Write; assigning the result back is the supported form.
df['totals.pageviews'] = df['totals.pageviews'].fillna(0).astype(int)
df['totals.pageviews'].describe()
count 90306.000000 mean 3.849124 std 7.096109 min 0.000000 25% 1.000000 50% 1.000000 75% 4.000000 max 469.000000 Name: totals.pageviews, dtype: float64
Vemos una distribución muy parecida a las variables anteriores
# Visualizamos el pageviews
plt.figure(figsize=(15,5))
sns.regplot(
data = df[df['visitWithTransaction'] == 1],
x = 'totals.pageviews',
y = target_log
);
# Correlación de Pearson
df[['totals.pageviews', target_log]].corr()
| totals.pageviews | totals.transactionRevenue_log | |
|---|---|---|
| totals.pageviews | 1.000000 | 0.395228 |
| totals.transactionRevenue_log | 0.395228 | 1.000000 |
# Normalizamos el pageviews para que tenga la misma distribución que el target
df['totals.pageviews_log'] = df['totals.pageviews'].apply(lambda x: np.log1p(x))
# Visualizamos el totals.pageviews normalizado
plt.figure(figsize=(15,5))
sns.regplot(
data = df[df['visitWithTransaction'] == 1],
x = 'totals.pageviews_log',
y = target_log
);
# Correlación de Pearson
df[['totals.pageviews_log', target_log]].corr()
| totals.pageviews_log | totals.transactionRevenue_log | |
|---|---|---|
| totals.pageviews_log | 1.000000 | 0.306681 |
| totals.transactionRevenue_log | 0.306681 | 1.000000 |
Podemos analizar la correlación entre pageviews y Hits
# Correlación de Pearson
df[['totals.pageviews', 'totals.hits']].corr()
| totals.pageviews | totals.hits | |
|---|---|---|
| totals.pageviews | 1.000000 | 0.982462 |
| totals.hits | 0.982462 | 1.000000 |
# Correlación de Pearson
df[['totals.pageviews_log', 'totals.hits_log']].corr()
| totals.pageviews_log | totals.hits_log | |
|---|---|---|
| totals.pageviews_log | 1.00000 | 0.99126 |
| totals.hits_log | 0.99126 | 1.00000 |
Vemos que la correlación es altisima:
La eliminamos por si utilizo una regresión lineal
# Eliminamos las variables que tienen correlación alta con hits
df.drop(['totals.pageviews', 'totals.pageviews_log'], axis = 1, inplace = True)
df['totals.bounces'].describe()
count 44973 unique 1 top 1 freq 44973 Name: totals.bounces, dtype: object
df['totals.newVisits'].describe()
count 70392 unique 1 top 1 freq 70392 Name: totals.newVisits, dtype: object
Ambas son booleanas 1/0. Las dejo de lado.
# Overview of the remaining categorical (object) variables
df.describe(include = 'object').T
| count | unique | top | freq | |
|---|---|---|---|---|
| channelGrouping | 90306 | 8 | Organic Search | 38445 |
| device.deviceCategory | 90306 | 3 | desktop | 66572 |
| geoNetwork.continent | 90306 | 6 | Americas | 44890 |
| geoNetwork.subContinent | 90306 | 23 | Northern America | 38911 |
| geoNetwork.country | 90306 | 189 | United States | 36335 |
| geoNetwork.region | 90306 | 332 | not available in demo dataset | 50639 |
| geoNetwork.metro | 90306 | 86 | not available in demo dataset | 50639 |
| geoNetwork.city | 90306 | 542 | not available in demo dataset | 50639 |
| geoNetwork.networkDomain | 90306 | 7394 | (not set) | 24320 |
| totals.bounces | 44973 | 1 | 1 | 44973 |
| totals.newVisits | 70392 | 1 | 1 | 70392 |
# Per-label stats (rows, transactions, mean log revenue) for the continent
explore_cat_values(dataframe = df, column = 'geoNetwork.continent', target_column = target_log)
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.continent | |||||
| Americas | 44890 | 0.497088 | 1097 | 0.024438 | 4.078602 |
| Asia | 22359 | 0.247592 | 2 | 0.000089 | 3.012438 |
| Europe | 19910 | 0.220473 | 9 | 0.000452 | 3.941313 |
| Oceania | 1556 | 0.017230 | 2 | 0.001285 | 4.329520 |
OJO: Habíamos visto que esta variable tenía 6 etiquetas y aquí sólo aparecen 4. Eso es debido a que la función explore_cat_values sólo nos devuelve las compras. Las sesiones que no tienen compras no aparecen. Para verlas utilizamos la otra función, con la que las podemos visualizar.
# Plot of all continent labels, including the ones with zero purchases
plot_cat_values(dataframe = df, column = 'geoNetwork.continent', target_column = target_log)
Aquí podemos ver las dos etiquetas en las que no hay ninguna compra (Africa y not_set)
Esta es una variable muy relevante, porque el algoritmo va a poder diferenciar entre América y el resto. Y de lo que es el resto, va a predecir que no hacen compras ( tan sólo hay 13 transacciones con un ratio de transacción muy bajo).
Haremos un OHE
# Per-label stats for the subcontinent
explore_cat_values(dataframe = df, column = 'geoNetwork.subContinent', target_column = target_log)
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.subContinent | |||||
| Australasia | 1537 | 0.017020 | 2 | 0.001301 | 4.329520 |
| Caribbean | 248 | 0.002746 | 3 | 0.012097 | 4.114201 |
| Central America | 1553 | 0.017197 | 3 | 0.001932 | 4.357528 |
| Eastern Asia | 4716 | 0.052222 | 1 | 0.000212 | 2.889816 |
| Eastern Europe | 4536 | 0.050229 | 1 | 0.000220 | 4.343286 |
| Northern America | 38911 | 0.430879 | 1082 | 0.027807 | 4.078029 |
| Northern Europe | 5919 | 0.065544 | 6 | 0.001014 | 3.801762 |
| South America | 4178 | 0.046265 | 9 | 0.002154 | 4.042635 |
| Southern Europe | 3595 | 0.039809 | 1 | 0.000278 | 4.654912 |
| Western Asia | 3872 | 0.042876 | 1 | 0.000258 | 3.135059 |
| Western Europe | 5860 | 0.064890 | 1 | 0.000171 | 3.663049 |
# Plot of all subcontinent labels
plot_cat_values(dataframe = df, column = 'geoNetwork.subContinent', target_column = target_log)
Finalmente aplicaremos OHE
# Per-country stats, kept in a variable for later filtering/sorting
results_by_country = explore_cat_values(dataframe = df, column = 'geoNetwork.country', target_column = target_log)
results_by_country
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.country | |||||
| Australia | 1309 | 0.014495 | 2 | 0.001528 | 4.329520 |
| Brazil | 2004 | 0.022191 | 1 | 0.000499 | 4.629765 |
| Canada | 2572 | 0.028481 | 19 | 0.007387 | 4.426030 |
| Chile | 192 | 0.002126 | 1 | 0.005208 | 2.889816 |
| Colombia | 464 | 0.005138 | 1 | 0.002155 | 3.711130 |
| Georgia | 91 | 0.001008 | 1 | 0.010989 | 3.135059 |
| Germany | 2018 | 0.022346 | 1 | 0.000496 | 3.663049 |
| Guatemala | 68 | 0.000753 | 1 | 0.014706 | 3.264996 |
| Ireland | 682 | 0.007552 | 2 | 0.002933 | 3.292852 |
| Italy | 1135 | 0.012568 | 1 | 0.000881 | 4.654912 |
| Mexico | 1280 | 0.014174 | 2 | 0.001563 | 4.903794 |
| Peru | 581 | 0.006434 | 1 | 0.001721 | 3.663562 |
| Puerto Rico | 79 | 0.000875 | 2 | 0.025316 | 3.852677 |
| Romania | 639 | 0.007076 | 1 | 0.001565 | 4.343286 |
| South Korea | 511 | 0.005659 | 1 | 0.001957 | 2.889816 |
| St. Lucia | 3 | 0.000033 | 1 | 0.333333 | 4.637250 |
| United Kingdom | 3805 | 0.042135 | 4 | 0.001051 | 4.056217 |
| United States | 36335 | 0.402354 | 1063 | 0.029256 | 4.071809 |
| Venezuela | 198 | 0.002193 | 5 | 0.025253 | 4.297889 |
Lo que quiero es coger aquellos paises que o bien compran mucho o bien compran muy poco. Quiero ver las situaciones más extremas.
La ordenación la podemos hacer de varias maneras:
# Sort by pct_transactions in ascending order
# (NOTE(review): the original comment said mean_revenue_log, but the code sorts by pct_transactions)
results_by_country.sort_values(by = 'pct_transactions')
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.country | |||||
| Germany | 2018 | 0.022346 | 1 | 0.000496 | 3.663049 |
| Brazil | 2004 | 0.022191 | 1 | 0.000499 | 4.629765 |
| Italy | 1135 | 0.012568 | 1 | 0.000881 | 4.654912 |
| United Kingdom | 3805 | 0.042135 | 4 | 0.001051 | 4.056217 |
| Australia | 1309 | 0.014495 | 2 | 0.001528 | 4.329520 |
| Mexico | 1280 | 0.014174 | 2 | 0.001563 | 4.903794 |
| Romania | 639 | 0.007076 | 1 | 0.001565 | 4.343286 |
| Peru | 581 | 0.006434 | 1 | 0.001721 | 3.663562 |
| South Korea | 511 | 0.005659 | 1 | 0.001957 | 2.889816 |
| Colombia | 464 | 0.005138 | 1 | 0.002155 | 3.711130 |
| Ireland | 682 | 0.007552 | 2 | 0.002933 | 3.292852 |
| Chile | 192 | 0.002126 | 1 | 0.005208 | 2.889816 |
| Canada | 2572 | 0.028481 | 19 | 0.007387 | 4.426030 |
| Georgia | 91 | 0.001008 | 1 | 0.010989 | 3.135059 |
| Guatemala | 68 | 0.000753 | 1 | 0.014706 | 3.264996 |
| Venezuela | 198 | 0.002193 | 5 | 0.025253 | 4.297889 |
| Puerto Rico | 79 | 0.000875 | 2 | 0.025316 | 3.852677 |
| United States | 36335 | 0.402354 | 1063 | 0.029256 | 4.071809 |
| St. Lucia | 3 | 0.000033 | 1 | 0.333333 | 4.637250 |
De momento voy a ordenar según el mean_revenue_log
# Sort by mean_revenue_log in ascending order
results_by_country.sort_values(by = 'mean_revenue_log')
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.country | |||||
| South Korea | 511 | 0.005659 | 1 | 0.001957 | 2.889816 |
| Chile | 192 | 0.002126 | 1 | 0.005208 | 2.889816 |
| Georgia | 91 | 0.001008 | 1 | 0.010989 | 3.135059 |
| Guatemala | 68 | 0.000753 | 1 | 0.014706 | 3.264996 |
| Ireland | 682 | 0.007552 | 2 | 0.002933 | 3.292852 |
| Germany | 2018 | 0.022346 | 1 | 0.000496 | 3.663049 |
| Peru | 581 | 0.006434 | 1 | 0.001721 | 3.663562 |
| Colombia | 464 | 0.005138 | 1 | 0.002155 | 3.711130 |
| Puerto Rico | 79 | 0.000875 | 2 | 0.025316 | 3.852677 |
| United Kingdom | 3805 | 0.042135 | 4 | 0.001051 | 4.056217 |
| United States | 36335 | 0.402354 | 1063 | 0.029256 | 4.071809 |
| Venezuela | 198 | 0.002193 | 5 | 0.025253 | 4.297889 |
| Australia | 1309 | 0.014495 | 2 | 0.001528 | 4.329520 |
| Romania | 639 | 0.007076 | 1 | 0.001565 | 4.343286 |
| Canada | 2572 | 0.028481 | 19 | 0.007387 | 4.426030 |
| Brazil | 2004 | 0.022191 | 1 | 0.000499 | 4.629765 |
| St. Lucia | 3 | 0.000033 | 1 | 0.333333 | 4.637250 |
| Italy | 1135 | 0.012568 | 1 | 0.000881 | 4.654912 |
| Mexico | 1280 | 0.014174 | 2 | 0.001563 | 4.903794 |
Los paises del medio son irrelevantes para el modelo. Me interesan los que se gastan mucho y los que se gastan poco (los extremos).
No me interesan los paises con pocas observaciones, como Santa Lucía. Ponemos un mínimo de 100 observaciones. Podemos jugar con este parámetro para ver como se comporta el algoritmo
# Keep only countries with more than 100 observations (tunable threshold),
# sorted by mean log revenue
results_by_country[results_by_country['n_rows'] > 100].sort_values(by = 'mean_revenue_log')
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.country | |||||
| Chile | 192 | 0.002126 | 1 | 0.005208 | 2.889816 |
| South Korea | 511 | 0.005659 | 1 | 0.001957 | 2.889816 |
| Ireland | 682 | 0.007552 | 2 | 0.002933 | 3.292852 |
| Germany | 2018 | 0.022346 | 1 | 0.000496 | 3.663049 |
| Peru | 581 | 0.006434 | 1 | 0.001721 | 3.663562 |
| Colombia | 464 | 0.005138 | 1 | 0.002155 | 3.711130 |
| United Kingdom | 3805 | 0.042135 | 4 | 0.001051 | 4.056217 |
| United States | 36335 | 0.402354 | 1063 | 0.029256 | 4.071809 |
| Venezuela | 198 | 0.002193 | 5 | 0.025253 | 4.297889 |
| Australia | 1309 | 0.014495 | 2 | 0.001528 | 4.329520 |
| Romania | 639 | 0.007076 | 1 | 0.001565 | 4.343286 |
| Canada | 2572 | 0.028481 | 19 | 0.007387 | 4.426030 |
| Brazil | 2004 | 0.022191 | 1 | 0.000499 | 4.629765 |
| Italy | 1135 | 0.012568 | 1 | 0.000881 | 4.654912 |
| Mexico | 1280 | 0.014174 | 2 | 0.001563 | 4.903794 |
# Keep the 5 lowest-spending countries in a list (5 is another tunable parameter)
frequent_countries = results_by_country[results_by_country['n_rows'] > 100]
last_countries = frequent_countries.sort_values(by='mean_revenue_log').head().index.to_list()
last_countries
['Chile', 'South Korea', 'Ireland', 'Germany', 'Peru']
# Keep the 5 highest-spending countries in a list
frequent_countries = results_by_country[results_by_country['n_rows'] > 100]
first_countries = frequent_countries.sort_values(by='mean_revenue_log').tail().index.to_list()
first_countries
['Romania', 'Canada', 'Brazil', 'Italy', 'Mexico']
# junto las dos listas en una única lista --> lista de paises sobre los que quiero tener una etiqueta específica
# El resto de paises no me interesan ya que se comportan mas o menos como la media del target
country_list = last_countries + first_countries
country_list
['Chile', 'South Korea', 'Ireland', 'Germany', 'Peru', 'Romania', 'Canada', 'Brazil', 'Italy', 'Mexico']
Generamos una función con todos estos pasos --> setOthersPataNegra
Cogemos la función SetOthers y la modificamos
# IPython introspection: show the source of the original setOthers helper
setOthers??
Signature: setOthers(dataframe, column, num_values) Source: def setOthers(dataframe, column, num_values): ''' Reduce el número de etiquetas. Agrupa las etiquetas de la variable categórica (column) que quedan fuera de las num_values primeras, en una única llamada 'Others', pasándole el dataframe. ''' # Me quedo con la lista de las primeras etiquetas (num_values) top_categories = dataframe[column].value_counts().head(num_values) top_categories_list = top_categories.index.to_list() # Añado a la lista la etiqueta 'Others' top_categories_list.append('Others') # Convierto a categórica sólo las etiquetas que le indico, de la variable (column) # Las otras etiquetas que no están en la lista, se convierten en nulos dataframe[column] = pd.Categorical( dataframe[column], categories = top_categories_list ) # Relleno los nulos con 'Others' y me devuelve las etiquetas de la variable (column) return dataframe[column].fillna('Others') # No es recomendable meter el inplace = True en el fillna. Es mejor guardar el resultado en una variable File: c:\users\jagui\appdata\local\temp\ipykernel_1840\2742398532.py Type: function
def setOthersPataNegra(dataframe, column, target_column, num_rows_min, top_n):
    '''
    Reduce the number of labels of a categorical variable.

    Keeps the `top_n` labels with the lowest mean (log) revenue and the
    `top_n` labels with the highest, considering only labels with more than
    `num_rows_min` observations. Every other label — the ones whose revenue
    behaviour is close to the overall target mean — is grouped under a
    single 'Others' label.

    Parameters
    ----------
    dataframe : pd.DataFrame   data containing the categorical column
    column : str               categorical column to reduce
    target_column : str        (log) target used to rank the labels
    num_rows_min : int         minimum observations for a label to be eligible
    top_n : int                number of labels kept at each extreme

    Returns
    -------
    pd.Series : the reduced categorical column (assign it back to the df)
    '''
    # Descriptive table of the categorical variable vs. the target
    results_by_category = explore_cat_values(dataframe, column, target_column)
    # Only labels with enough observations, ranked by mean log revenue
    eligible = results_by_category[results_by_category['n_rows'] > num_rows_min].sort_values(by='mean_revenue_log')
    # The 'top_n' lowest-spending labels
    last_categories = eligible.head(top_n).index.to_list()
    # The 'top_n' highest-spending labels
    first_categories = eligible.tail(top_n).index.to_list()
    # Labels that keep their own identity. dict.fromkeys removes duplicates
    # while preserving order: head/tail can overlap when there are fewer than
    # 2*top_n eligible labels, and pd.Categorical raises ValueError on
    # duplicate categories.
    top_categories_list = list(dict.fromkeys(first_categories + last_categories))
    # Extra bucket for everything else
    top_categories_list.append('Others')
    # Labels not in the list become NaN ...
    dataframe[column] = pd.Categorical(
        dataframe[column],
        categories = top_categories_list
    )
    # ... and are then filled with 'Others'. Returning the result (rather than
    # fillna(..., inplace=True)) lets the caller decide where to store it.
    return dataframe[column].fillna('Others')
Incluso podemos parametrizar la variable según lo ordeno ('mean_revenue_log' o 'pct_transactions')
# Reduce the country column: min 100 rows per label, 5 labels at each extreme
df['geoNetwork.country'] = setOthersPataNegra(dataframe = df, column = 'geoNetwork.country', target_column = target_log, num_rows_min = 100, top_n = 5)
df['geoNetwork.country']
sessionId
1438082600262726746_1472803483 Others
1283542838194038522_1472885255 Others
4339756682310369249_1472828340 Others
062441254657008214_1472875520 Others
1381975521299261523_1472829727 South Korea
...
8316382343226738015_1483549157 Others
6636384798982309878_1483525429 Others
469840327005431380_1483573235 Germany
2140149974339316233_1483557808 Others
5123779100307500332_1483554750 Others
Name: geoNetwork.country, Length: 90306, dtype: category
Categories (11, object): ['Romania', 'Canada', 'Brazil', 'Italy', ..., 'Ireland', 'Germany', 'Peru', 'Others']
11 etiquetas distintas ( 5 por arriba, 5 por abajo y el others)
# Label frequencies after the reduction ('Others' absorbs most rows)
df['geoNetwork.country'].value_counts()
Others 78692 Canada 2572 Germany 2018 Brazil 2004 Mexico 1280 Italy 1135 Ireland 682 Romania 639 Peru 581 South Korea 511 Chile 192 Name: geoNetwork.country, dtype: int64
Vemos que el Others tiene agrupadas la mayoría de las etiquetas que no le van a aportar nada al modelo. Lo vemos:
# Check: 'Others' should sit near the middle of the mean-revenue ranking
explore_cat_values(dataframe = df, column = 'geoNetwork.country', target_column = target_log).sort_values(by = 'mean_revenue_log')
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.country | |||||
| Chile | 192 | 0.002126 | 1 | 0.005208 | 2.889816 |
| South Korea | 511 | 0.005659 | 1 | 0.001957 | 2.889816 |
| Ireland | 682 | 0.007552 | 2 | 0.002933 | 3.292852 |
| Germany | 2018 | 0.022346 | 1 | 0.000496 | 3.663049 |
| Peru | 581 | 0.006434 | 1 | 0.001721 | 3.663562 |
| Others | 78692 | 0.871393 | 1080 | 0.013724 | 4.071444 |
| Romania | 639 | 0.007076 | 1 | 0.001565 | 4.343286 |
| Canada | 2572 | 0.028481 | 19 | 0.007387 | 4.426030 |
| Brazil | 2004 | 0.022191 | 1 | 0.000499 | 4.629765 |
| Italy | 1135 | 0.012568 | 1 | 0.000881 | 4.654912 |
| Mexico | 1280 | 0.014174 | 2 | 0.001563 | 4.903794 |
Others efectivamente se encuentra en el medio, por lo que no va a ser relevante para el modelo.
Con esto hemos reducido la variable a tan sólo 11 etiquetas. Ya podemos aplicar OHE
La etiqueta Others es redundante:
# region/metro/city share the same dominant 'not available...' placeholder
df[['geoNetwork.region', 'geoNetwork.metro', 'geoNetwork.city']].describe()
| geoNetwork.region | geoNetwork.metro | geoNetwork.city | |
|---|---|---|---|
| count | 90306 | 90306 | 90306 |
| unique | 332 | 86 | 542 |
| top | not available in demo dataset | not available in demo dataset | not available in demo dataset |
| freq | 50639 | 50639 | 50639 |
# Per-city stats, sorted by mean log revenue
explore_cat_values(dataframe = df, column = 'geoNetwork.city', target_column = target_log).sort_values(by = 'mean_revenue_log')
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.city | |||||
| Boston | 98 | 0.001085 | 1 | 0.010204 | 2.659560 |
| Syracuse | 2 | 0.000022 | 1 | 0.500000 | 2.887590 |
| Dallas | 164 | 0.001816 | 1 | 0.006098 | 3.044046 |
| San Mateo | 37 | 0.000410 | 1 | 0.027027 | 3.112181 |
| Dublin | 516 | 0.005714 | 2 | 0.003876 | 3.292852 |
| Kitchener | 35 | 0.000388 | 1 | 0.028571 | 3.338613 |
| Kansas City | 11 | 0.000122 | 2 | 0.181818 | 3.353329 |
| Phoenix | 41 | 0.000454 | 1 | 0.024390 | 3.366606 |
| Dundalk | 3 | 0.000033 | 1 | 0.333333 | 3.366606 |
| Philadelphia | 85 | 0.000941 | 2 | 0.023529 | 3.500736 |
| Vancouver | 46 | 0.000509 | 1 | 0.021739 | 3.663305 |
| La Victoria | 175 | 0.001938 | 1 | 0.005714 | 3.663562 |
| Atlanta | 256 | 0.002835 | 14 | 0.054688 | 3.714678 |
| Palo Alto | 286 | 0.003167 | 8 | 0.027972 | 3.741527 |
| Salem | 230 | 0.002547 | 3 | 0.013043 | 3.746739 |
| San Jose | 1017 | 0.011262 | 15 | 0.014749 | 3.752433 |
| Kirkland | 204 | 0.002259 | 10 | 0.049020 | 3.760539 |
| Boulder | 43 | 0.000476 | 2 | 0.046512 | 3.762557 |
| Washington | 246 | 0.002724 | 13 | 0.052846 | 3.770209 |
| Sydney | 545 | 0.006035 | 1 | 0.001835 | 3.805996 |
| Seattle | 517 | 0.005725 | 27 | 0.052224 | 3.809356 |
| Fremont | 85 | 0.000941 | 3 | 0.035294 | 3.873751 |
| not available in demo dataset | 50639 | 0.560749 | 426 | 0.008412 | 3.899517 |
| Lenoir | 1 | 0.000011 | 1 | 1.000000 | 3.907613 |
| Cupertino | 65 | 0.000720 | 4 | 0.061538 | 3.913653 |
| Sunnyvale | 1328 | 0.014706 | 43 | 0.032380 | 3.933340 |
| Rio de Janeiro | 75 | 0.000831 | 1 | 0.013333 | 3.939444 |
| Lake Oswego | 17 | 0.000188 | 2 | 0.117647 | 3.948779 |
| Santa Clara | 311 | 0.003444 | 6 | 0.019293 | 3.955042 |
| Houston | 262 | 0.002901 | 4 | 0.015267 | 4.061015 |
| Evanston | 18 | 0.000199 | 1 | 0.055556 | 4.083452 |
| Council Bluffs | 8 | 0.000089 | 1 | 0.125000 | 4.097174 |
| Los Angeles | 863 | 0.009556 | 27 | 0.031286 | 4.101301 |
| Vienna | 55 | 0.000609 | 1 | 0.018182 | 4.110710 |
| London | 1340 | 0.014838 | 3 | 0.002239 | 4.111696 |
| Mountain View | 4046 | 0.044803 | 108 | 0.026693 | 4.177904 |
| Austin | 397 | 0.004396 | 17 | 0.042821 | 4.179867 |
| Irvine | 99 | 0.001096 | 4 | 0.040404 | 4.197886 |
| San Francisco | 2046 | 0.022656 | 79 | 0.038612 | 4.204252 |
| Portland | 28 | 0.000310 | 2 | 0.071429 | 4.223227 |
| Singapore | 351 | 0.003887 | 2 | 0.005698 | 4.290351 |
| Ann Arbor | 236 | 0.002613 | 20 | 0.084746 | 4.319889 |
| Pittsburgh | 96 | 0.001063 | 3 | 0.031250 | 4.320306 |
| Menlo Park | 9 | 0.000100 | 1 | 0.111111 | 4.322542 |
| New York | 2512 | 0.027817 | 143 | 0.056927 | 4.338790 |
| Bucharest | 131 | 0.001451 | 1 | 0.007634 | 4.343286 |
| Denver | 40 | 0.000443 | 1 | 0.025000 | 4.376260 |
| San Diego | 154 | 0.001705 | 8 | 0.051948 | 4.415505 |
| Jersey City | 29 | 0.000321 | 4 | 0.137931 | 4.470787 |
| Cambridge | 163 | 0.001805 | 12 | 0.073620 | 4.599574 |
| Chicago | 793 | 0.008781 | 43 | 0.054224 | 4.609583 |
| San Bruno | 149 | 0.001650 | 13 | 0.087248 | 4.611507 |
| Sao Paulo | 419 | 0.004640 | 1 | 0.002387 | 4.629765 |
| Orlando | 26 | 0.000288 | 1 | 0.038462 | 4.634243 |
| Maracaibo | 24 | 0.000266 | 4 | 0.166667 | 4.664193 |
| (not set) | 3489 | 0.038635 | 2 | 0.000573 | 4.745148 |
| Mexico City | 245 | 0.002713 | 1 | 0.004082 | 4.803775 |
| Toronto | 488 | 0.005404 | 8 | 0.016393 | 5.121778 |
| Sacramento | 6 | 0.000066 | 1 | 0.166667 | 5.208940 |
Podemos observar varias cosas:
Aplicamos la función setOthersPataNegra --> Nos quedamos las 3 por arriba y las 3 por abajo y el resto las agrupamos en Others
# Reduce the city column: min 100 rows per label, 3 labels at each extreme
df['geoNetwork.city'] = setOthersPataNegra(dataframe = df, column = 'geoNetwork.city', target_column = target_log, num_rows_min = 100, top_n = 3)
df['geoNetwork.city']
sessionId
1438082600262726746_1472803483 Others
1283542838194038522_1472885255 Others
4339756682310369249_1472828340 Others
062441254657008214_1472875520 Others
1381975521299261523_1472829727 Others
...
8316382343226738015_1483549157 Others
6636384798982309878_1483525429 Others
469840327005431380_1483573235 Others
2140149974339316233_1483557808 Others
5123779100307500332_1483554750 Others
Name: geoNetwork.city, Length: 90306, dtype: category
Categories (7, object): ['(not set)', 'Mexico City', 'Toronto', 'Dallas', 'Dublin', 'La Victoria', 'Others']
# Label frequencies after the city reduction
df['geoNetwork.city'].value_counts()
Others 85229 (not set) 3489 Dublin 516 Toronto 488 Mexico City 245 La Victoria 175 Dallas 164 Name: geoNetwork.city, dtype: int64
# Check: 'Others' again lands in the middle of the revenue ranking
explore_cat_values(dataframe = df, column = 'geoNetwork.city', target_column = target_log).sort_values(by = 'mean_revenue_log')
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.city | |||||
| Dallas | 164 | 0.001816 | 1 | 0.006098 | 3.044046 |
| Dublin | 516 | 0.005714 | 2 | 0.003876 | 3.292852 |
| La Victoria | 175 | 0.001938 | 1 | 0.005714 | 3.663562 |
| Others | 85229 | 0.943780 | 1095 | 0.012848 | 4.069242 |
| (not set) | 3489 | 0.038635 | 2 | 0.000573 | 4.745148 |
| Mexico City | 245 | 0.002713 | 1 | 0.004082 | 4.803775 |
| Toronto | 488 | 0.005404 | 8 | 0.016393 | 5.121778 |
Nos queda Others que son la inmensa mayoría en el medio, por lo que no va a ser relevante para el modelo y hacemos que se fije en las otras 6.
GeoNetwork - Metro
# Sample of the metro column
df['geoNetwork.metro'].head(20)
sessionId 1438082600262726746_1472803483 (not set) 1283542838194038522_1472885255 not available in demo dataset 4339756682310369249_1472828340 (not set) 062441254657008214_1472875520 not available in demo dataset 1381975521299261523_1472829727 (not set) 0808960480137718074_1472804813 (not set) 5637149591671308745_1472819775 (not set) 9870305678537397476_1472814953 not available in demo dataset 9271059985757878297_1472838507 not available in demo dataset 4283381413149200106_1472836538 (not set) 2870162892603227525_1472858877 not available in demo dataset 9459337971215135976_1472816813 not available in demo dataset 401634289917197318_1472830591 (not set) 2332957681418505787_1472804709 (not set) 240989783927851488_1472857884 San Francisco-Oakland-San Jose CA 8929725610124514851_1472803218 not available in demo dataset 9434259910512164814_1472827093 not available in demo dataset 7738788348584858595_1472811585 not available in demo dataset 8770237469311081313_1472857939 (not set) 92384702709597602_1472848131 (not set) Name: geoNetwork.metro, dtype: object
# 86 distinct metros; most frequent is the 'not available' placeholder
df['geoNetwork.metro'].describe()
count 90306 unique 86 top not available in demo dataset freq 50639 Name: geoNetwork.metro, dtype: object
# Per-metro stats, sorted by mean log revenue
explore_cat_values(dataframe = df, column = 'geoNetwork.metro', target_column = target_log).sort_values(by = 'mean_revenue_log')
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.metro | |||||
| Syracuse NY | 2 | 0.000022 | 1 | 0.500000 | 2.887590 |
| Dallas-Ft. Worth TX | 181 | 0.002004 | 1 | 0.005525 | 3.044046 |
| Kansas City MO | 12 | 0.000133 | 2 | 0.166667 | 3.353329 |
| Phoenix AZ | 54 | 0.000598 | 1 | 0.018519 | 3.366606 |
| Baltimore MD | 5 | 0.000055 | 1 | 0.200000 | 3.366606 |
| Philadelphia PA | 92 | 0.001019 | 2 | 0.021739 | 3.500736 |
| Atlanta GA | 265 | 0.002934 | 14 | 0.052830 | 3.714678 |
| Roanoke-Lynchburg VA | 230 | 0.002547 | 3 | 0.013043 | 3.746739 |
| Washington DC (Hagerstown MD) | 316 | 0.003499 | 13 | 0.041139 | 3.770209 |
| Seattle-Tacoma WA | 778 | 0.008615 | 37 | 0.047558 | 3.796162 |
| not available in demo dataset | 50639 | 0.560749 | 426 | 0.008412 | 3.899517 |
| Charlotte NC | 43 | 0.000476 | 1 | 0.023256 | 3.907613 |
| Denver CO | 86 | 0.000952 | 3 | 0.034884 | 3.967125 |
| Houston TX | 262 | 0.002901 | 4 | 0.015267 | 4.061015 |
| Portland OR | 138 | 0.001528 | 4 | 0.028986 | 4.086003 |
| Omaha NE | 13 | 0.000144 | 1 | 0.076923 | 4.097174 |
| London | 1337 | 0.014805 | 3 | 0.002244 | 4.111696 |
| Los Angeles CA | 1004 | 0.011118 | 31 | 0.030876 | 4.113763 |
| San Francisco-Oakland-San Jose CA | 9541 | 0.105652 | 281 | 0.029452 | 4.117766 |
| Austin TX | 397 | 0.004396 | 17 | 0.042821 | 4.179867 |
| Detroit MI | 247 | 0.002735 | 20 | 0.080972 | 4.319889 |
| Pittsburgh PA | 96 | 0.001063 | 3 | 0.031250 | 4.320306 |
| New York NY | 2581 | 0.028581 | 147 | 0.056955 | 4.342382 |
| San Diego CA | 156 | 0.001727 | 8 | 0.051282 | 4.415505 |
| Boston MA-Manchester NH | 268 | 0.002968 | 13 | 0.048507 | 4.450342 |
| (not set) | 20211 | 0.223806 | 27 | 0.001336 | 4.466154 |
| Chicago IL | 814 | 0.009014 | 44 | 0.054054 | 4.597625 |
| Orlando-Daytona Beach-Melbourne FL | 38 | 0.000421 | 1 | 0.026316 | 4.634243 |
| Sacramento-Stockton-Modesto CA | 6 | 0.000066 | 1 | 0.166667 | 5.208940 |
70000 de las 90000 observaciones no nos aporta información (not available in demo dataset o not set) y el resto de las etiquetas nos aporta información que es parecida a la que nos aporta city. La eliminamos
# Drop metro: ~70k of 90k rows carry no information (placeholder values)
# and the informative rows mostly mirror what city already provides
df.drop('geoNetwork.metro', axis=1, inplace=True)
GeoNetwork - Region
# Sample of the region column
df['geoNetwork.region'].head(10)
sessionId 1438082600262726746_1472803483 Sindh 1283542838194038522_1472885255 not available in demo dataset 4339756682310369249_1472828340 Karnataka 062441254657008214_1472875520 not available in demo dataset 1381975521299261523_1472829727 Seoul 0808960480137718074_1472804813 North Holland 5637149591671308745_1472819775 Masovian Voivodeship 9870305678537397476_1472814953 not available in demo dataset 9271059985757878297_1472838507 not available in demo dataset 4283381413149200106_1472836538 New South Wales Name: geoNetwork.region, dtype: object
# 332 distinct regions; again dominated by the 'not available' placeholder
df['geoNetwork.region'].describe()
count 90306 unique 332 top not available in demo dataset freq 50639 Name: geoNetwork.region, dtype: object
# Per-region stats, sorted by mean log revenue
explore_cat_values(dataframe = df, column = 'geoNetwork.region', target_column = target_log).sort_values(by = 'mean_revenue_log')
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.region | |||||
| County Dublin | 369 | 0.004086 | 2 | 0.005420 | 3.292852 |
| Missouri | 13 | 0.000144 | 2 | 0.153846 | 3.353329 |
| Arizona | 54 | 0.000598 | 1 | 0.018519 | 3.366606 |
| Maryland | 6 | 0.000066 | 1 | 0.166667 | 3.366606 |
| British Columbia | 70 | 0.000775 | 1 | 0.014286 | 3.663305 |
| Lima Region | 175 | 0.001938 | 1 | 0.005714 | 3.663562 |
| Georgia | 266 | 0.002946 | 14 | 0.052632 | 3.714678 |
| Virginia | 304 | 0.003366 | 3 | 0.009868 | 3.746739 |
| District of Columbia | 246 | 0.002724 | 13 | 0.052846 | 3.770209 |
| Washington | 778 | 0.008615 | 37 | 0.047558 | 3.796162 |
| New South Wales | 545 | 0.006035 | 1 | 0.001835 | 3.805996 |
| not available in demo dataset | 50639 | 0.560749 | 426 | 0.008412 | 3.899517 |
| North Carolina | 52 | 0.000576 | 1 | 0.019231 | 3.907613 |
| State of Rio de Janeiro | 75 | 0.000831 | 1 | 0.013333 | 3.939444 |
| Colorado | 89 | 0.000986 | 3 | 0.033708 | 3.967125 |
| Pennsylvania | 182 | 0.002015 | 5 | 0.027473 | 3.992478 |
| Oregon | 138 | 0.001528 | 4 | 0.028986 | 4.086003 |
| Iowa | 8 | 0.000089 | 1 | 0.125000 | 4.097174 |
| Texas | 885 | 0.009800 | 22 | 0.024859 | 4.106629 |
| Vienna | 55 | 0.000609 | 1 | 0.018182 | 4.110710 |
| England | 1411 | 0.015625 | 3 | 0.002126 | 4.111696 |
| California | 10728 | 0.118796 | 321 | 0.029922 | 4.128199 |
| Michigan | 268 | 0.002968 | 20 | 0.074627 | 4.319889 |
| New York | 2520 | 0.027905 | 144 | 0.057143 | 4.328712 |
| Bucharest | 131 | 0.001451 | 1 | 0.007634 | 4.343286 |
| Massachusetts | 271 | 0.003001 | 13 | 0.047970 | 4.450342 |
| New Jersey | 72 | 0.000797 | 4 | 0.055556 | 4.470787 |
| (not set) | 2863 | 0.031703 | 4 | 0.001397 | 4.517750 |
| Illinois | 814 | 0.009014 | 44 | 0.054054 | 4.597625 |
| State of Sao Paulo | 432 | 0.004784 | 1 | 0.002315 | 4.629765 |
| Florida | 62 | 0.000687 | 1 | 0.016129 | 4.634243 |
| Zulia | 24 | 0.000266 | 4 | 0.166667 | 4.664193 |
| Mexico City | 245 | 0.002713 | 1 | 0.004082 | 4.803775 |
| Ontario | 572 | 0.006334 | 9 | 0.015734 | 4.923649 |
Lo que haremos es eliminar la region
# Drop region for the same reason as metro (mostly placeholders / redundant)
df.drop('geoNetwork.region', axis = 1, inplace = True)
# Sample of the network domain column
df['geoNetwork.networkDomain'].head()
sessionId 1438082600262726746_1472803483 unknown.unknown 1283542838194038522_1472885255 broadband.hu 4339756682310369249_1472828340 unknown.unknown 062441254657008214_1472875520 uwa.edu.au 1381975521299261523_1472829727 unknown.unknown Name: geoNetwork.networkDomain, dtype: object
# Very high cardinality: 7394 distinct domains
df['geoNetwork.networkDomain'].describe()
count 90306 unique 7394 top (not set) freq 24320 Name: geoNetwork.networkDomain, dtype: object
Tenemos 7394 etiquetas distintas
# The 50 most frequent domains
df['geoNetwork.networkDomain'].value_counts().head(50)
(not set) 24320 unknown.unknown 14545 comcast.net 2852 rr.com 1425 verizon.net 1353 ttnet.com.tr 1295 comcastbusiness.net 1007 hinet.net 761 virginm.net 606 cox.net 581 prod-infinitum.com.mx 568 3bb.co.th 566 btcentralplus.com 561 att.net 525 sbcglobal.net 523 optonline.net 515 google.com 511 totbb.net 481 asianet.co.th 456 vnpt.vn 444 rima-tde.net 434 pldt.net 384 amazonaws.com 378 t-ipconnect.de 377 telecomitalia.it 363 virtua.com.br 337 qwest.net 329 airtelbroadband.in 323 bell.ca 296 ztomy.com 288 bhn.net 287 superonline.net 271 actcorp.in 267 wanadoo.fr 249 spcsdns.net 245 ocn.ne.jp 237 shawcable.net 225 com 222 rogers.com 208 rdsnet.ro 205 gvt.net.br 202 optusnet.com.au 200 bezeqint.net 197 proxad.net 190 sfr.net 182 videotron.ca 178 ziggo.nl 172 cantv.net 163 telesp.net.br 159 mycingular.net 155 Name: geoNetwork.networkDomain, dtype: int64
Vamos a utilizar una técnica de Natural Languages Processing para hacer un conteo de palabras (de momento, no se si la información que voy a sacar va a ser relevante o no ... vamos probando cosas):
# Flag domains containing '.net'. regex=False makes the dot a literal
# character: with the default regex=True, '.' matches ANY character, so
# domains like 'asianet.co.th' would be wrongly flagged as .net traffic.
df['network_net'] = df['geoNetwork.networkDomain'].str.contains('.net', case = False, regex = False).astype(int)
df['network_net'] # 0 means not .net, 1 means .net
sessionId
1438082600262726746_1472803483 0
1283542838194038522_1472885255 0
4339756682310369249_1472828340 0
062441254657008214_1472875520 0
1381975521299261523_1472829727 0
..
8316382343226738015_1483549157 1
6636384798982309878_1483525429 0
469840327005431380_1483573235 0
2140149974339316233_1483557808 1
5123779100307500332_1483554750 1
Name: network_net, Length: 90306, dtype: int32
Vemos como se comporta esta variable con respecto al target
# Descriptive table of network_net vs. the log target
results_by_network_net = explore_cat_values(dataframe = df, column = 'network_net', target_column = target_log)
results_by_network_net
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| network_net | |||||
| 0 | 64768 | 0.717206 | 807 | 0.012460 | 4.096480 |
| 1 | 25538 | 0.282794 | 303 | 0.011865 | 4.021527 |
El hecho de que el trafico venga de .net o no, no parece que sea muy relevante, porque los dos tienen comportamientos muy parecidos (pct_transactions y mean_revenue_log muy similares)
# Same table but against the raw (non-log) target, to compare both versions
explore_cat_values(dataframe = df, column = 'network_net', target_column = target)
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| network_net | |||||
| 0 | 64768 | 0.717206 | 807 | 0.012460 | 126.397100 |
| 1 | 25538 | 0.282794 | 303 | 0.011865 | 134.959802 |
Podemos comprobar tambien la existencia o no de outliers comparando las dos versiones del target (target y target_log):
# Flag domains containing '.com'. regex=False makes the dot literal —
# with the default regex=True, '.' matches ANY character, so e.g.
# 'rogers.com' and 'telecomitalia.it' ('ecom') would both match.
df['network_com'] = df['geoNetwork.networkDomain'].str.contains('.com', case = False, regex = False).astype(int)
df['network_com'] # 0 means not .com, 1 means .com
sessionId
1438082600262726746_1472803483 0
1283542838194038522_1472885255 0
4339756682310369249_1472828340 0
062441254657008214_1472875520 0
1381975521299261523_1472829727 0
..
8316382343226738015_1483549157 0
6636384798982309878_1483525429 0
469840327005431380_1483573235 0
2140149974339316233_1483557808 0
5123779100307500332_1483554750 0
Name: network_com, Length: 90306, dtype: int32
# Descriptive table of network_com vs. the log target
results_by_network_com = explore_cat_values(dataframe = df, column = 'network_com', target_column = target_log)
results_by_network_com
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| network_com | |||||
| 0 | 75972 | 0.841273 | 1001 | 0.013176 | 4.080610 |
| 1 | 14334 | 0.158727 | 109 | 0.007604 | 4.033859 |
# Both labels have far more than 500 rows, so the filter keeps everything
results_by_network_com[results_by_network_com['n_rows']>500]
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| network_com | |||||
| 0 | 75972 | 0.841273 | 1001 | 0.013176 | 4.080610 |
| 1 | 14334 | 0.158727 | 109 | 0.007604 | 4.033859 |
# Full network domain column
df['geoNetwork.networkDomain']
sessionId
1438082600262726746_1472803483 unknown.unknown
1283542838194038522_1472885255 broadband.hu
4339756682310369249_1472828340 unknown.unknown
062441254657008214_1472875520 uwa.edu.au
1381975521299261523_1472829727 unknown.unknown
...
8316382343226738015_1483549157 completel.net
6636384798982309878_1483525429 unknown.unknown
469840327005431380_1483573235 (not set)
2140149974339316233_1483557808 tedata.net
5123779100307500332_1483554750 prtc.net
Name: geoNetwork.networkDomain, Length: 90306, dtype: object
Otra cosa que podríamos hacer es, de todos los dominios, quedarnos con los que tienen más de 500 observaciones y luego aplicar un setOther, quedándonos con los 10 que más observaciones tiene y agrupando el resto en una etiqueta llamada Others.
# Tabla descriptiva de networkDomain
results_by_network = explore_cat_values(dataframe = df, column = 'geoNetwork.networkDomain', target_column = target_log)
results_by_network
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.networkDomain | |||||
| (not set) | 24320 | 0.269307 | 635 | 0.026110 | 4.135669 |
| 150pelican.com | 1 | 0.000011 | 1 | 1.000000 | 5.768321 |
| 2uibe.com | 2 | 0.000022 | 1 | 0.500000 | 2.889816 |
| abbott.com | 1 | 0.000011 | 1 | 1.000000 | 6.293050 |
| acgtexas.com | 1 | 0.000011 | 1 | 1.000000 | 5.911068 |
| ... | ... | ... | ... | ... | ... |
| veracitynetworks.com | 5 | 0.000055 | 2 | 0.400000 | 4.852838 |
| verizon.net | 1353 | 0.014982 | 52 | 0.038433 | 4.109215 |
| vocus.net.au | 4 | 0.000044 | 1 | 0.250000 | 3.805996 |
| web-pass.com | 36 | 0.000399 | 1 | 0.027778 | 4.581492 |
| ztomy.com | 288 | 0.003189 | 9 | 0.031250 | 3.699165 |
130 rows × 5 columns
# Me quedo con los dominios que tienen más de 500 observaciones
results_by_network[results_by_network['n_rows'] > 500]
| n_rows | pct_rows | transactions | pct_transactions | mean_revenue_log | |
|---|---|---|---|---|---|
| geoNetwork.networkDomain | |||||
| (not set) | 24320 | 0.269307 | 635 | 0.026110 | 4.135669 |
| att.net | 525 | 0.005814 | 4 | 0.007619 | 3.307206 |
| comcast.net | 2852 | 0.031582 | 74 | 0.025947 | 4.002050 |
| comcastbusiness.net | 1007 | 0.011151 | 34 | 0.033764 | 4.087657 |
| cox.net | 581 | 0.006434 | 15 | 0.025818 | 4.270334 |
| optonline.net | 515 | 0.005703 | 13 | 0.025243 | 4.376403 |
| prod-infinitum.com.mx | 568 | 0.006290 | 1 | 0.001761 | 5.003812 |
| rr.com | 1425 | 0.015780 | 34 | 0.023860 | 3.721483 |
| sbcglobal.net | 523 | 0.005791 | 21 | 0.040153 | 3.614204 |
| unknown.unknown | 14545 | 0.161063 | 40 | 0.002750 | 3.812260 |
| verizon.net | 1353 | 0.014982 | 52 | 0.038433 | 4.109215 |
setOthers??
Signature: setOthers(dataframe, column, num_values) Source: def setOthers(dataframe, column, num_values): ''' Reduce el número de etiquetas. Agrupa las etiquetas de la variable categórica (column) que quedan fuera de las num_values primeras, en una única llamada 'Others', pasándole el dataframe. ''' # Me quedo con la lista de las primeras etiquetas (num_values) top_categories = dataframe[column].value_counts().head(num_values) top_categories_list = top_categories.index.to_list() # Añado a la lista la etiqueta 'Others' top_categories_list.append('Others') # Convierto a categórica sólo las etiquetas que le indico, de la variable (column) # Las otras etiquetas que no están en la lista, se convierten en nulos dataframe[column] = pd.Categorical( dataframe[column], categories = top_categories_list ) # Relleno los nulos con 'Others' y me devuelve las etiquetas de la variable (column) return dataframe[column].fillna('Others') # No es recomendable meter el inplace = True en el fillna. Es mejor guardar el resultado en una variable File: c:\users\jagui\appdata\local\temp\ipykernel_1840\2742398532.py Type: function
df['geoNetwork.networkDomain'] = setOthers(dataframe = df, column = 'geoNetwork.networkDomain', num_values = 10)
df['geoNetwork.networkDomain']
sessionId
1438082600262726746_1472803483 unknown.unknown
1283542838194038522_1472885255 Others
4339756682310369249_1472828340 unknown.unknown
062441254657008214_1472875520 Others
1381975521299261523_1472829727 unknown.unknown
...
8316382343226738015_1483549157 Others
6636384798982309878_1483525429 unknown.unknown
469840327005431380_1483573235 (not set)
2140149974339316233_1483557808 Others
5123779100307500332_1483554750 Others
Name: geoNetwork.networkDomain, Length: 90306, dtype: category
Categories (11, object): ['(not set)', 'unknown.unknown', 'comcast.net', 'rr.com', ..., 'hinet.net', 'virginm.net', 'cox.net', 'Others']
Sin embargo, como ya hemos sacado la información que queríamos de la variable, al ser categórica, podemos eliminarla.
# Drop networkDomain: its information has already been extracted into the
# network_net / network_com indicator variables.
df.drop(['geoNetwork.networkDomain'], axis = 1, inplace = True)
# Drop the leftover CSV index column as well
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
# Inspect bounces including missing values (NaN vs the single label "1")
df['totals.bounces'].value_counts(dropna = False)
NaN 45333 1 44973 Name: totals.bounces, dtype: int64
df['totals.newVisits'].value_counts(dropna = False)
1 70392 NaN 19914 Name: totals.newVisits, dtype: int64
Ambas variables aparecen como object, pero yo se que son booleanas 1/0. Podría tratarlas de dos formas:
Por lo tanto, lo único que haremos con estas variables es aplicarle un OHE
# Vemos lo que nos ha quedado
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 90306 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 channelGrouping 90306 non-null object 1 visitNumber 90306 non-null int64 2 device.browser 90306 non-null category 3 device.operatingSystem 90306 non-null category 4 device.isMobile 90306 non-null int32 5 device.deviceCategory 90306 non-null object 6 geoNetwork.continent 90306 non-null object 7 geoNetwork.subContinent 90306 non-null object 8 geoNetwork.country 90306 non-null category 9 geoNetwork.city 90306 non-null category 10 totals.hits 90306 non-null int32 11 totals.bounces 44973 non-null object 12 totals.newVisits 70392 non-null object 13 totals.transactionRevenue 90306 non-null float64 14 visitWithTransaction 90306 non-null int32 15 totals.transactionRevenue_log 90306 non-null float64 16 year 90306 non-null int64 17 month 90306 non-null int64 18 monthDay 90306 non-null int64 19 weekDay 90306 non-null int64 20 quarter 90306 non-null int64 21 week 90306 non-null int64 22 visitHour 90306 non-null int64 23 visitNumber_log 90306 non-null float64 24 totals.hits_log 90306 non-null float64 25 network_net 90306 non-null int32 26 network_com 90306 non-null int32 dtypes: category(4), float64(4), int32(5), int64(8), object(6) memory usage: 17.2+ MB
# Variables numéricas
df.describe(include = np.number).T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| visitNumber | 90306.0 | 2.254269 | 9.102378 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 373.000000 |
| device.isMobile | 90306.0 | 0.262829 | 0.440173 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| totals.hits | 90306.0 | 4.596251 | 9.670030 | 1.000000 | 1.000000 | 2.000000 | 4.000000 | 500.000000 |
| totals.transactionRevenue | 90306.0 | 1.582345 | 32.383189 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4198.500000 |
| visitWithTransaction | 90306.0 | 0.012292 | 0.110184 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| totals.transactionRevenue_log | 90306.0 | 0.050101 | 0.466902 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.342721 |
| year | 90306.0 | 2016.518116 | 0.499674 | 2016.000000 | 2016.000000 | 2017.000000 | 2017.000000 | 2017.000000 |
| month | 90306.0 | 6.980710 | 3.483099 | 1.000000 | 4.000000 | 7.000000 | 10.000000 | 12.000000 |
| monthDay | 90306.0 | 15.682291 | 8.813583 | 1.000000 | 8.000000 | 16.000000 | 23.000000 | 31.000000 |
| weekDay | 90306.0 | 2.736496 | 1.929640 | 0.000000 | 1.000000 | 3.000000 | 4.000000 | 6.000000 |
| quarter | 90306.0 | 2.666711 | 1.139098 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 4.000000 |
| week | 90306.0 | 28.441765 | 15.109857 | 1.000000 | 15.000000 | 30.000000 | 42.000000 | 52.000000 |
| visitHour | 90306.0 | 12.543452 | 7.069903 | 0.000000 | 6.000000 | 14.000000 | 19.000000 | 23.000000 |
| visitNumber_log | 90306.0 | 0.884559 | 0.486897 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 5.924256 |
| totals.hits_log | 90306.0 | 1.266027 | 0.785211 | 0.693147 | 0.693147 | 1.098612 | 1.609438 | 6.216606 |
| network_net | 90306.0 | 0.282794 | 0.450360 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| network_com | 90306.0 | 0.158727 | 0.365423 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
# Variables categóricas
df.describe(exclude = np.number).T
| count | unique | top | freq | |
|---|---|---|---|---|
| channelGrouping | 90306 | 8 | Organic Search | 38445 |
| device.browser | 90306 | 6 | Chrome | 62147 |
| device.operatingSystem | 90306 | 7 | Windows | 35174 |
| device.deviceCategory | 90306 | 3 | desktop | 66572 |
| geoNetwork.continent | 90306 | 6 | Americas | 44890 |
| geoNetwork.subContinent | 90306 | 23 | Northern America | 38911 |
| geoNetwork.country | 90306 | 11 | Others | 78692 |
| geoNetwork.city | 90306 | 7 | Others | 85229 |
| totals.bounces | 44973 | 1 | 1 | 44973 |
| totals.newVisits | 70392 | 1 | 1 | 70392 |
Nos queda por hacer:
def OHE(dataframe, column_name):
    '''
    One-hot encode the categorical variable `column_name`.

    Builds one boolean indicator column per label of `column_name`
    (prefixed with the column name), appends those indicators to the
    dataframe, and returns a new dataframe without the original
    categorical column.
    '''
    # One indicator column per label, e.g. "channelGrouping_Direct"
    indicator_columns = pd.get_dummies(dataframe[column_name], prefix = column_name)
    # Attach the indicators alongside the existing columns
    with_indicators = pd.concat([dataframe, indicator_columns], axis = 1)
    # The encoded column itself is no longer needed
    return with_indicators.drop(column_name, axis = 1)
df.select_dtypes(exclude=np.number).columns
Index(['channelGrouping', 'device.browser', 'device.operatingSystem',
'device.deviceCategory', 'geoNetwork.continent',
'geoNetwork.subContinent', 'geoNetwork.country', 'geoNetwork.city',
'totals.bounces', 'totals.newVisits'],
dtype='object')
for column in df.select_dtypes(exclude=np.number).columns:
print(f'{column}') # Imprimimos el nombre de la columna (y no lo que contiene) para ver que lo está haciendo bien
channelGrouping device.browser device.operatingSystem device.deviceCategory geoNetwork.continent geoNetwork.subContinent geoNetwork.country geoNetwork.city totals.bounces totals.newVisits
OHE(dataframe = df, column_name = 'channelGrouping').T
| sessionId | 1438082600262726746_1472803483 | 1283542838194038522_1472885255 | 4339756682310369249_1472828340 | 062441254657008214_1472875520 | 1381975521299261523_1472829727 | 0808960480137718074_1472804813 | 5637149591671308745_1472819775 | 9870305678537397476_1472814953 | 9271059985757878297_1472838507 | 4283381413149200106_1472836538 | ... | 8869363093179043393_1483601927 | 4676831114906257190_1483526811 | 3591531861629139100_1483535166 | 2042198043275420503_1483540075 | 6095547410502786759_1483540412 | 8316382343226738015_1483549157 | 6636384798982309878_1483525429 | 469840327005431380_1483573235 | 2140149974339316233_1483557808 | 5123779100307500332_1483554750 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| visitNumber | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 2 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| device.browser | Safari | Safari | Chrome | Chrome | Chrome | Chrome | Chrome | Chrome | Edge | Safari | ... | Chrome | Chrome | Chrome | Edge | Chrome | Chrome | Chrome | Internet Explorer | Chrome | Chrome |
| device.operatingSystem | iOS | Macintosh | Android | Windows | Macintosh | Macintosh | Windows | Windows | Windows | iOS | ... | Android | Windows | Macintosh | Windows | Macintosh | Macintosh | Windows | Windows | Windows | Windows |
| device.isMobile | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| device.deviceCategory | mobile | desktop | mobile | desktop | desktop | desktop | desktop | desktop | desktop | mobile | ... | mobile | desktop | desktop | desktop | desktop | desktop | desktop | desktop | desktop | desktop |
| geoNetwork.continent | Asia | Europe | Asia | Oceania | Asia | Europe | Europe | Europe | Europe | Oceania | ... | Europe | Asia | Europe | Europe | Americas | Europe | Asia | Europe | Africa | Americas |
| geoNetwork.subContinent | Southern Asia | Eastern Europe | Southern Asia | Australasia | Eastern Asia | Western Europe | Eastern Europe | Western Europe | Eastern Europe | Australasia | ... | Northern Europe | Southern Asia | Northern Europe | Southern Europe | Northern America | Western Europe | Western Asia | Western Europe | Northern Africa | Caribbean |
| geoNetwork.country | Others | Others | Others | Others | South Korea | Others | Others | Others | Others | Others | ... | Others | Others | Others | Italy | Others | Others | Others | Germany | Others | Others |
| geoNetwork.city | Others | Others | Others | Others | Others | Others | Others | Others | Others | Others | ... | Others | Others | Others | Others | Others | Others | Others | Others | Others | Others |
| totals.hits | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 2 | 2 | 2 | 3 | 3 | 3 | 4 | 9 | 16 | 17 |
| totals.bounces | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| totals.newVisits | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | NaN | 1 | NaN | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| totals.transactionRevenue | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| visitWithTransaction | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| totals.transactionRevenue_log | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| year | 2016 | 2016 | 2016 | 2016 | 2016 | 2016 | 2016 | 2016 | 2016 | 2016 | ... | 2017 | 2017 | 2017 | 2017 | 2017 | 2017 | 2017 | 2017 | 2017 | 2017 |
| month | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| monthDay | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| weekDay | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| quarter | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| week | 35 | 35 | 35 | 35 | 35 | 35 | 35 | 35 | 35 | 35 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| visitHour | 10 | 8 | 16 | 6 | 17 | 10 | 14 | 13 | 19 | 19 | ... | 8 | 11 | 14 | 15 | 15 | 17 | 11 | 0 | 20 | 19 |
| visitNumber_log | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | ... | 1.098612 | 0.693147 | 1.098612 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 |
| totals.hits_log | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | 0.693147 | ... | 1.098612 | 1.098612 | 1.098612 | 1.386294 | 1.386294 | 1.386294 | 1.609438 | 2.302585 | 2.833213 | 2.890372 |
| network_net | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
| network_com | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| channelGrouping_(Other) | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| channelGrouping_Affiliates | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| channelGrouping_Direct | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| channelGrouping_Display | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| channelGrouping_Organic Search | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| channelGrouping_Paid Search | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| channelGrouping_Referral | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| channelGrouping_Social | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
34 rows × 90306 columns
Vemos que si aplico OHE sin más, me lo imprime por pantalla, pero yo quiero agregárselo al dataframe, por lo que lo guardo en df
df = OHE(dataframe = df, column_name = 'channelGrouping')
df
| visitNumber | device.browser | device.operatingSystem | device.isMobile | device.deviceCategory | geoNetwork.continent | geoNetwork.subContinent | geoNetwork.country | geoNetwork.city | totals.hits | ... | network_net | network_com | channelGrouping_(Other) | channelGrouping_Affiliates | channelGrouping_Direct | channelGrouping_Display | channelGrouping_Organic Search | channelGrouping_Paid Search | channelGrouping_Referral | channelGrouping_Social | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sessionId | |||||||||||||||||||||
| 1438082600262726746_1472803483 | 1 | Safari | iOS | 1 | mobile | Asia | Southern Asia | Others | Others | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1283542838194038522_1472885255 | 1 | Safari | Macintosh | 0 | desktop | Europe | Eastern Europe | Others | Others | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4339756682310369249_1472828340 | 1 | Chrome | Android | 1 | mobile | Asia | Southern Asia | Others | Others | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 062441254657008214_1472875520 | 1 | Chrome | Windows | 0 | desktop | Oceania | Australasia | Others | Others | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1381975521299261523_1472829727 | 1 | Chrome | Macintosh | 0 | desktop | Asia | Eastern Asia | South Korea | Others | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8316382343226738015_1483549157 | 1 | Chrome | Macintosh | 0 | desktop | Europe | Western Europe | Others | Others | 3 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 6636384798982309878_1483525429 | 1 | Chrome | Windows | 0 | desktop | Asia | Western Asia | Others | Others | 4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 469840327005431380_1483573235 | 1 | Internet Explorer | Windows | 0 | desktop | Europe | Western Europe | Germany | Others | 9 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2140149974339316233_1483557808 | 1 | Chrome | Windows | 0 | desktop | Africa | Northern Africa | Others | Others | 16 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 5123779100307500332_1483554750 | 1 | Chrome | Windows | 0 | desktop | Americas | Caribbean | Others | Others | 17 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
90306 rows × 34 columns
# Apply OHE to every remaining categorical (non-numeric) column
for column in df.select_dtypes(exclude=np.number).columns:
    df = OHE(dataframe = df, column_name = column)  # reassign: OHE returns a new dataframe
df.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Index: 90306 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Columns: 90 entries, visitNumber to totals.newVisits_1 dtypes: float64(4), int32(5), int64(8), uint8(73) memory usage: 19.0+ MB
Ya no tengo ninguna variable categórica --> tenemos 90 variables numéricas
Tenemos 3 variables relacionadas con el target:
Si dejamos estas tres variables, la predicción sería buenísima porque el algoritmo cogería aleatoriamente una de ellas, vería que correlaciona perfectamente con el target y la utilizaría para hacer la predicción, pero nuestro modelo no serviría para nada. Hay que eliminar estas variables
NOTA: Si hacemos un modelo que predice perfectamente, lo más seguro es que se nos haya colado alguna variable de estas.
# Drop visitWithTransaction: it is a classification-style label and this is regression
# Drop the raw (untransformed) target totals.transactionRevenue; the log version is the target
target_linked_features = ['totals.transactionRevenue', 'visitWithTransaction']
Podríamos hacer también el modelling con totals.transactionRevenue y compararlo con el modelling utilizando el totals.transactionRevenue_log
# Remove the target-correlated variables: leaving them in would leak the answer to the model
df.drop(target_linked_features, axis = 1, inplace = True)
Para elegir la partición de validación escogemos una parte del dataset que se vaya a parecer a la realidad:
Por lo tanto, lo más habitual es utilizar las variables de fechas para hacer la partición.
# Count observations per (year, month) to choose a time-based validation cut
df.pivot_table(
index = ['year', 'month'], # grouping variables
values = target_log, # target
aggfunc = len) # number of observations per group
| totals.transactionRevenue_log | ||
|---|---|---|
| year | month | |
| 2016 | 8 | 7484 |
| 9 | 7137 | |
| 10 | 9638 | |
| 11 | 11386 | |
| 12 | 7872 | |
| 2017 | 1 | 6499 |
| 2 | 6215 | |
| 3 | 6944 | |
| 4 | 6706 | |
| 5 | 6619 | |
| 6 | 6353 | |
| 7 | 7217 | |
| 8 | 236 |
# Validation uses the last three months of data (2017-06 onward).
# year*100 + month encodes e.g. June 2017 as 201706, allowing a single comparison.
df_val = df[df['year'] * 100 + df['month'] >= 201706] # Validation split --> time-based, not random
df_dev = df[df['year'] * 100 + df['month'] < 201706] # Development split (train/test)
df_val.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Index: 13806 entries, 9909254185037943559_1498221316 to 8383966530038205596_1499608861 Columns: 88 entries, visitNumber to totals.newVisits_1 dtypes: float64(3), int32(4), int64(8), uint8(73) memory usage: 2.4+ MB
df_dev.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Index: 76500 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Columns: 88 entries, visitNumber to totals.newVisits_1 dtypes: float64(3), int32(4), int64(8), uint8(73) memory usage: 13.5+ MB
Separamos los atributos y el target en ambas particiones
# NOTE: no inplace=True here — drop returns a new frame, so df_val / df_dev stay intact
df_val_X = df_val.drop(target_log, axis = 1) # features only, validation
df_val_y = df_val[[target_log]] # target only, validation (double brackets keep it a DataFrame)
df_dev_X = df_dev.drop(target_log, axis = 1) # features only, development
df_dev_y = df_dev[[target_log]] # target only, development
df_val_X.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 13806 entries, 9909254185037943559_1498221316 to 8383966530038205596_1499608861 Columns: 87 entries, visitNumber to totals.newVisits_1 dtypes: float64(2), int32(4), int64(8), uint8(73) memory usage: 2.3+ MB
df_val_y.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 13806 entries, 9909254185037943559_1498221316 to 8383966530038205596_1499608861 Columns: 1 entries, totals.transactionRevenue_log to totals.transactionRevenue_log dtypes: float64(1) memory usage: 215.7+ KB
df_dev_X.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 76500 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Columns: 87 entries, visitNumber to totals.newVisits_1 dtypes: float64(2), int32(4), int64(8), uint8(73) memory usage: 12.9+ MB
df_dev_y.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 76500 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Columns: 1 entries, totals.transactionRevenue_log to totals.transactionRevenue_log dtypes: float64(1) memory usage: 1.2+ MB
Development en train/test --> (Random Holdout)
Para separar el development en train y test, utilizamos el model_selection de scikit-learn, con ello obtendremos dos particiones aleatorias, pero que tendrán la misma distribución. Al hacer esto, nos aseguramos de que si el modelo funciona bien en train pero mal en test, es debido a que está memorizando. Si las distribuciones fuesen distintas, no sabríamos si es debido a que tienen diferentes distribuciones o a que está overfiteando.
model_selection.train_test_split??
Signature: model_selection.train_test_split( *arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None, ) Source: @validate_params( { "test_size": [ Interval(RealNotInt, 0, 1, closed="neither"), Interval(numbers.Integral, 1, None, closed="left"), None, ], "train_size": [ Interval(RealNotInt, 0, 1, closed="neither"), Interval(numbers.Integral, 1, None, closed="left"), None, ], "random_state": ["random_state"], "shuffle": ["boolean"], "stratify": ["array-like", None], }, prefer_skip_nested_validation=True, ) def train_test_split( *arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None, ): """Split arrays or matrices into random train and test subsets. Quick utility that wraps input validation, ``next(ShuffleSplit().split(X, y))``, and application to input data into a single call for splitting (and optionally subsampling) data into a one-liner. Read more in the :ref:`User Guide <cross_validation>`. Parameters ---------- *arrays : sequence of indexables with same length / shape[0] Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes. test_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.25. train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. random_state : int, RandomState instance or None, default=None Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. 
shuffle : bool, default=True Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None. stratify : array-like, default=None If not None, data is split in a stratified fashion, using this as the class labels. Read more in the :ref:`User Guide <stratification>`. Returns ------- splitting : list, length=2 * len(arrays) List containing train-test split of inputs. .. versionadded:: 0.16 If the input is sparse, the output will be a ``scipy.sparse.csr_matrix``. Else, output type is the same as the input type. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import train_test_split >>> X, y = np.arange(10).reshape((5, 2)), range(5) >>> X array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> list(y) [0, 1, 2, 3, 4] >>> X_train, X_test, y_train, y_test = train_test_split( ... X, y, test_size=0.33, random_state=42) ... >>> X_train array([[4, 5], [0, 1], [6, 7]]) >>> y_train [2, 0, 3] >>> X_test array([[2, 3], [8, 9]]) >>> y_test [1, 4] >>> train_test_split(y, shuffle=False) [[0, 1, 2], [3, 4]] """ n_arrays = len(arrays) if n_arrays == 0: raise ValueError("At least one array required as input") arrays = indexable(*arrays) n_samples = _num_samples(arrays[0]) n_train, n_test = _validate_shuffle_split( n_samples, test_size, train_size, default_test_size=0.25 ) if shuffle is False: if stratify is not None: raise ValueError( "Stratified train/test split is not implemented for shuffle=False" ) train = np.arange(n_train) test = np.arange(n_train, n_train + n_test) else: if stratify is not None: CVClass = StratifiedShuffleSplit else: CVClass = ShuffleSplit cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state) train, test = next(cv.split(X=arrays[0], y=stratify)) return list( chain.from_iterable( (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays ) ) File: c:\users\jagui\anaconda3\lib\site-packages\sklearn\model_selection\_split.py Type: function
# train/test split --> random partition of development into train and test
# (shuffling gives both splits the same distribution)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
df_dev_X, # X
df_dev_y, # y
test_size = 0.2, # test split size (chosen so it is similar in size to the validation split)
random_state = RANDOM_STATE, # reproducible shuffling across runs
shuffle = True, # shuffle observations before splitting (in case the dataset has some ordering)
stratify = None # dataset is large enough; stratification not needed
)
stratify --> en caso de tener datasets muy pequeños, podemos meter en el stratify el target --> stratify = df_val_y
Lo que hace es mirar la distribución del target y a la hora de hacer el train/test split trata de mantener esta distribución (porque si no, de forma aleatoria crearíamos dos datasets con distribuciones distintas en el target y eso implicaría que después no sabríamos si la diferencia de rendimiento en train y test es debida a diferencias en distribución o a memorización).
X_train.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 61200 entries, 557936446145904693_1491870382 to 2804167227252516459_1480387919 Columns: 87 entries, visitNumber to totals.newVisits_1 dtypes: float64(2), int32(4), int64(8), uint8(73) memory usage: 10.3+ MB
X_test.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 15300 entries, 262709141779758284_1477984469 to 4698747731138429443_1480729024 Columns: 87 entries, visitNumber to totals.newVisits_1 dtypes: float64(2), int32(4), int64(8), uint8(73) memory usage: 2.6+ MB
y_train.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 61200 entries, 557936446145904693_1491870382 to 2804167227252516459_1480387919 Columns: 1 entries, totals.transactionRevenue_log to totals.transactionRevenue_log dtypes: float64(1) memory usage: 956.2+ KB
y_test.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 15300 entries, 262709141779758284_1477984469 to 4698747731138429443_1480729024 Columns: 1 entries, totals.transactionRevenue_log to totals.transactionRevenue_log dtypes: float64(1) memory usage: 239.1+ KB
Vemos la distribución del target, que es la más crítica
y_train.describe()
| totals.transactionRevenue_log | |
|---|---|
| count | 61200.000000 |
| mean | 0.048309 |
| std | 0.460687 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 0.000000 |
| max | 8.342721 |
y_test.describe()
| totals.transactionRevenue_log | |
|---|---|
| count | 15300.000000 |
| mean | 0.049866 |
| std | 0.466321 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 0.000000 |
| max | 8.015360 |
Vemos que tienen medias y desviaciones parecidas, por lo tanto, a nivel de target, train y test tienen la misma distribución.
Modelización
1.- XGBoost --> Con DecisionTree
# Instantiate the model
first_model = xgb.XGBRegressor(
booster = 'gbtree', # gradient-boosted decision trees
random_state = 42, # reproducibility of the boosting process
n_estimators = 100, # number of boosting rounds (error-reweighting iterations) --> 100 trees --> 100 partial predictions
max_depth = 6, # depth of each decision tree (number of splits/questions asked)
verbosity = 1 # message verbosity --> 0 (silent), 1 (warning), 2 (info), 3 (debug)
)
# El setup del Gradient Boosting es prácticamente el mismo que el del RandomForest (tienen los mismos parámetros)
Lo normal es tirar 3-4 preguntas (a partir de 6-7 este algoritmo tiende a overfitear) e ir jugando con los estimadores en función del overfiting. Si hay overfiting, bajamos el número de estimadores. Si bajamos el número de estimadores y sigue dando overfiting, reducimos el número de preguntas.
%%time
first_model.fit(X_train, y_train) # entrenamos el modelo (con la versión logaritmica del train)
Wall time: 7.47 s
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)En clasificación teníamos tanto la predicción (0,1) como la probabilidad. En regresión tenemos la predicción directamente.
# Predict on the test split (log scale of the target).
# In regression the model returns the prediction directly (no class probabilities).
raw_predictions = first_model.predict(X_test)  # numpy array of log-revenue predictions
test_predictions = pd.DataFrame(
    data=raw_predictions,
    index=X_test.index,      # keep the session IDs from X_test as the index
    columns=['Prediction'],  # single, readable column name
)
test_predictions.head(10)
| Prediction | |
|---|---|
| sessionId | |
| 262709141779758284_1477984469 | -0.001421 |
| 3905170756457668068_1495747064 | -0.001036 |
| 5616308294992765651_1490189234 | -0.011254 |
| 9888210956612447301_1480649299 | -0.001173 |
| 6010391620248091077_1471462746 | -0.000524 |
| 2976978272439784710_1485976864 | -0.001207 |
| 2086692138871491662_1488032986 | 0.001256 |
| 5778799106519831947_1489056775 | -0.002017 |
| 2355960437023363475_1487022281 | 0.000351 |
| 4184528169169510033_1471242040 | -0.001998 |
Comparamos la predicción con la realidad
# Join the real target (y_test) with the predictions, aligned on the sessionId index
results_df = y_test.join(test_predictions)
results_df
| totals.transactionRevenue_log | Prediction | |
|---|---|---|
| sessionId | ||
| 262709141779758284_1477984469 | 0.0 | -0.001421 |
| 3905170756457668068_1495747064 | 0.0 | -0.001036 |
| 5616308294992765651_1490189234 | 0.0 | -0.011254 |
| 9888210956612447301_1480649299 | 0.0 | -0.001173 |
| 6010391620248091077_1471462746 | 0.0 | -0.000524 |
| ... | ... | ... |
| 7716257687992716386_1488387409 | 0.0 | -0.001842 |
| 1654457621776981165_1491998378 | 0.0 | -0.000007 |
| 7856658110453381610_1479704693 | 0.0 | -0.002104 |
| 9017936067833212713_1478216892 | 0.0 | 1.003912 |
| 4698747731138429443_1480729024 | 0.0 | -0.001398 |
15300 rows × 2 columns
Vemos que en una de las sesiones no se hace compra y sin embargo el algoritmo predice que sí. Esto es muy raro, porque lo normal es que sea al revés, que se haga compra y el algoritmo diga que no, debido a que hay muchos más ceros que unos y por lo tanto el algoritmo se va a fijar más en los ceros. Lo vemos con más detenimiento.
Veamos qué ocurre en las predicciones de gente que se ha gastado mucho en la parte de test, para saber si está prediciendo bien o no
# Sort sessions from highest to lowest real spend to inspect the big spenders
results_df.sort_values(by = target_log, ascending = False).head(10)
| totals.transactionRevenue_log | Prediction | |
|---|---|---|
| sessionId | ||
| 4759981878863963838_1490729055 | 8.015360 | 3.253612 |
| 3835134197841326255_1482419274 | 7.328437 | 1.013251 |
| 029534875378731789_1470583131 | 7.159292 | 1.156632 |
| 4471415710206918415_1494371651 | 6.962054 | -0.262163 |
| 9029794295932939024_1489662849 | 6.641626 | 5.173580 |
| 4471415710206918415_1478235992 | 6.633318 | -0.591810 |
| 4984366501121503466_1487605589 | 6.627988 | 2.962398 |
| 0432606793105704004_1485327772 | 6.555072 | 1.364958 |
| 1878787981465510145_1490909206 | 6.522387 | 0.278961 |
| 3152246617474456269_1472828860 | 6.455984 | 1.261454 |
Evaluamos el modelo
# Rename the columns to make the table easier to read
results_df.columns = ['Target', 'Prediction']
Utilizamos la métrica del Mean Square Error - MSE y del Root Mean Square Error - RMSE
# Signed error: positive means the model under-predicted
results_df['error'] = results_df['Target'] - results_df['Prediction']
results_df.head()
| Target | Prediction | error | |
|---|---|---|---|
| sessionId | |||
| 262709141779758284_1477984469 | 0.0 | -0.001421 | 0.001421 |
| 3905170756457668068_1495747064 | 0.0 | -0.001036 | 0.001036 |
| 5616308294992765651_1490189234 | 0.0 | -0.011254 | 0.011254 |
| 9888210956612447301_1480649299 | 0.0 | -0.001173 | 0.001173 |
| 6010391620248091077_1471462746 | 0.0 | -0.000524 | 0.000524 |
# Squared error, and its root (the per-row absolute error).
results_df['squared_error'] = results_df['error'].pow(2)
results_df['rooted_squared_error'] = np.sqrt(results_df['squared_error'])
results_df.head()
| Target | Prediction | error | squared_error | rooted_squared_error | |
|---|---|---|---|---|---|
| sessionId | |||||
| 262709141779758284_1477984469 | 0.0 | -0.001421 | 0.001421 | 2.019578e-06 | 0.001421 |
| 3905170756457668068_1495747064 | 0.0 | -0.001036 | 0.001036 | 1.074217e-06 | 0.001036 |
| 5616308294992765651_1490189234 | 0.0 | -0.011254 | 0.011254 | 1.266599e-04 | 0.011254 |
| 9888210956612447301_1480649299 | 0.0 | -0.001173 | 0.001173 | 1.375736e-06 | 0.001173 |
| 6010391620248091077_1471462746 | 0.0 | -0.000524 | 0.000524 | 2.746968e-07 | 0.000524 |
# Sort by the real target (largest spenders first) to see where the errors concentrate
results_df.sort_values(by = 'Target', ascending=False).head()
| Target | Prediction | error | squared_error | rooted_squared_error | |
|---|---|---|---|---|---|
| sessionId | |||||
| 4759981878863963838_1490729055 | 8.015360 | 3.253612 | 4.761748 | 22.674247 | 4.761748 |
| 3835134197841326255_1482419274 | 7.328437 | 1.013251 | 6.315186 | 39.881573 | 6.315186 |
| 029534875378731789_1470583131 | 7.159292 | 1.156632 | 6.002660 | 36.031927 | 6.002660 |
| 4471415710206918415_1494371651 | 6.962054 | -0.262163 | 7.224217 | 52.189305 | 7.224217 |
| 9029794295932939024_1489662849 | 6.641626 | 5.173580 | 1.468046 | 2.155158 | 1.468046 |
# Aggregate evaluation metrics for the baseline model.
squared_errors = results_df['squared_error']
mse = squared_errors.mean()  # Mean Squared Error
rmse = np.sqrt(mse)          # Root Mean Squared Error
print(f'MSE: {np.round(mse,4)} - RMSE: {np.round(rmse,4)}')
MSE: 0.1883 - RMSE: 0.434
# Compare the model's MSE against a "dumb" baseline that always predicts 0
# (its squared error is then simply Target**2)
results_df['squared_error_modelo_tonto'] = results_df['Target']**2
mse_tonto = results_df['squared_error_modelo_tonto'].mean()
print(f'MSE: {np.round(mse,4)} y Benchmark_tonto: {np.round(mse_tonto,4)}')
MSE: 0.1883 y Benchmark_tonto: 0.2199
En los problemas de regresión, lo más importante es analizar dónde se producen los errores. Hay que mirar siempre la distribución de los errores
# Visualize the distribution of the per-row errors
# NOTE(review): sns.distplot is deprecated (removed in recent seaborn);
# consider sns.histplot/sns.displot when upgrading.
plt.figure(figsize = (15,5))
sns.distplot(
results_df['rooted_squared_error']
);
Hay muchos errores pequeños y unos pocos errores grandes. Esto significa que el modelo está prediciendo bien en la gente que no compra (como es lógico) y está prediciendo mal en la gente que compra. Por lo tanto, tengo que hacer que el algoritmo se fije más en la gente que compra para que aprenda a predecir bien en estos casos.
# Error distribution restricted to buyers (Target > 0), where the model fails most
plt.figure(figsize = (15,5))
sns.distplot(
results_df[results_df['Target'] > 0]['rooted_squared_error'],
fit=stats.norm # overlay a fitted normal curve for reference
);
# Distribution of the predictions restricted to buyers (Target > 0)
plt.figure(figsize=(15,5))
sns.distplot(
results_df[results_df['Target'] > 0]['Prediction'],
fit=stats.norm
);
# Distribution of the real target restricted to buyers (Target > 0)
plt.figure(figsize=(15,5))
sns.distplot(
results_df[results_df['Target'] > 0]['Target'],
fit=stats.norm
);
Rebalanceamos el dataset
# Split the development set by purchase behaviour.
mask_no_purchase = df_dev[target_log] == 0  # sessions with zero (log) revenue
mask_purchase = df_dev[target_log] > 0      # sessions with positive (log) revenue
df_dev_zero = df_dev.loc[mask_no_purchase]
df_dev_nonzero = df_dev.loc[mask_purchase]
df_dev_zero.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Index: 75594 entries, 1438082600262726746_1472803483 to 5123779100307500332_1483554750 Columns: 88 entries, visitNumber to totals.newVisits_1 dtypes: float64(3), int32(4), int64(8), uint8(73) memory usage: 13.3+ MB
df_dev_nonzero.info(verbose = False)
<class 'pandas.core.frame.DataFrame'> Index: 906 entries, 8885051388942907862_1472827393 to 4988517937139937145_1496230436 Columns: 88 entries, visitNumber to totals.newVisits_1 dtypes: float64(3), int32(4), int64(8), uint8(73) memory usage: 163.7+ KB
Tengo sólo 900 personas que compran frente a 75000 que no compran
Para rebalancear el dataset, hay que ir probando. Empezamos siendo más conservadores y si vemos que el error no mejora, somos más agresivos. Lo normal es que no tengan la misma proporción. Podemos usar como benchmark un 20-80 o un 15-85: dependiendo de la cantidad de datos, se ajustan las proporciones entre la clase minoritaria y la clase mayoritaria.
En nuestro caso, empezamos siendo conservadores y aplicamos un 25-75 --> Hay 3 veces más de gente que no compra (clase 0) que de gente que compra (clase 1)
# number of buying sessions in the development set
n_nonzeros = len(df_dev_nonzero)
n_nonzeros
906
# Undersample the non-buyers down to 3x the number of buyers (a 25/75 class ratio)
n_zeros = n_nonzeros * 3
n_zeros
2718
df.sample??
Signature: df.sample( n: 'int | None' = None, frac: 'float | None' = None, replace: 'bool_t' = False, weights=None, random_state: 'RandomState | None' = None, axis: 'Axis | None' = None, ignore_index: 'bool_t' = False, ) -> 'NDFrameT' Source: @final def sample( self: NDFrameT, n: int | None = None, frac: float | None = None, replace: bool_t = False, weights=None, random_state: RandomState | None = None, axis: Axis | None = None, ignore_index: bool_t = False, ) -> NDFrameT: """ Return a random sample of items from an axis of object. You can use `random_state` for reproducibility. Parameters ---------- n : int, optional Number of items from axis to return. Cannot be used with `frac`. Default = 1 if `frac` = None. frac : float, optional Fraction of axis items to return. Cannot be used with `n`. replace : bool, default False Allow or disallow sampling of the same row more than once. weights : str or ndarray-like, optional Default 'None' results in equal probability weighting. If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and index values in sampled object not in weights will be assigned weights of zero. If called on a DataFrame, will accept the name of a column when axis = 0. Unless weights are a Series, weights must be same length as axis being sampled. If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. .. versionchanged:: 1.1.0 array-like and BitGenerator object now passed to np.random.RandomState() as seed .. versionchanged:: 1.4.0 np.random.Generator objects now accepted axis : {0 or ‘index’, 1 or ‘columns’, None}, default None Axis to sample. 
Accepts axis number or name. Default is stat axis for given data type (0 for Series and DataFrames). ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. .. versionadded:: 1.3.0 Returns ------- Series or DataFrame A new object of same type as caller containing `n` items randomly sampled from the caller object. See Also -------- DataFrameGroupBy.sample: Generates random samples from each group of a DataFrame object. SeriesGroupBy.sample: Generates random samples from each group of a Series object. numpy.random.choice: Generates a random sample from a given 1-D numpy array. Notes ----- If `frac` > 1, `replacement` should be set to `True`. Examples -------- >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], ... 'num_specimen_seen': [10, 2, 1, 8]}, ... index=['falcon', 'dog', 'spider', 'fish']) >>> df num_legs num_wings num_specimen_seen falcon 2 2 10 dog 4 0 2 spider 8 0 1 fish 0 0 8 Extract 3 random elements from the ``Series`` ``df['num_legs']``: Note that we use `random_state` to ensure the reproducibility of the examples. >>> df['num_legs'].sample(n=3, random_state=1) fish 0 spider 8 falcon 2 Name: num_legs, dtype: int64 A random 50% sample of the ``DataFrame`` with replacement: >>> df.sample(frac=0.5, replace=True, random_state=1) num_legs num_wings num_specimen_seen dog 4 0 2 fish 0 0 8 An upsample sample of the ``DataFrame`` with replacement: Note that `replace` parameter has to be `True` for `frac` parameter > 1. >>> df.sample(frac=2, replace=True, random_state=1) num_legs num_wings num_specimen_seen dog 4 0 2 fish 0 0 8 falcon 2 2 10 falcon 2 2 10 fish 0 0 8 dog 4 0 2 fish 0 0 8 dog 4 0 2 Using a DataFrame column as weights. Rows with larger value in the `num_specimen_seen` column are more likely to be sampled. 
>>> df.sample(n=2, weights='num_specimen_seen', random_state=1) num_legs num_wings num_specimen_seen falcon 2 2 10 fish 0 0 8 """ # noqa:E501 if axis is None: axis = self._stat_axis_number axis = self._get_axis_number(axis) obj_len = self.shape[axis] # Process random_state argument rs = com.random_state(random_state) size = sample.process_sampling_size(n, frac, replace) if size is None: assert frac is not None size = round(frac * obj_len) if weights is not None: weights = sample.preprocess_weights(self, weights, axis) sampled_indices = sample.sample(obj_len, size, replace, weights, rs) result = self.take(sampled_indices, axis=axis) if ignore_index: result.index = default_index(len(result)) return result File: c:\users\jagui\anaconda3\lib\site-packages\pandas\core\generic.py Type: method
# Draw a reproducible random undersample of the non-buyers,
# sized n_zeros (3x the number of buyers).
df_dev_zero_sample = df_dev_zero.sample(n=n_zeros, random_state=RANDOM_STATE)
df_dev_zero_sample.info()
<class 'pandas.core.frame.DataFrame'> Index: 2718 entries, 316404407482592073_1473796330 to 1302015618664715049_1480965472 Data columns (total 88 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 visitNumber 2718 non-null int64 1 device.isMobile 2718 non-null int32 2 totals.hits 2718 non-null int32 3 totals.transactionRevenue_log 2718 non-null float64 4 year 2718 non-null int64 5 month 2718 non-null int64 6 monthDay 2718 non-null int64 7 weekDay 2718 non-null int64 8 quarter 2718 non-null int64 9 week 2718 non-null int64 10 visitHour 2718 non-null int64 11 visitNumber_log 2718 non-null float64 12 totals.hits_log 2718 non-null float64 13 network_net 2718 non-null int32 14 network_com 2718 non-null int32 15 channelGrouping_(Other) 2718 non-null uint8 16 channelGrouping_Affiliates 2718 non-null uint8 17 channelGrouping_Direct 2718 non-null uint8 18 channelGrouping_Display 2718 non-null uint8 19 channelGrouping_Organic Search 2718 non-null uint8 20 channelGrouping_Paid Search 2718 non-null uint8 21 channelGrouping_Referral 2718 non-null uint8 22 channelGrouping_Social 2718 non-null uint8 23 device.browser_Chrome 2718 non-null uint8 24 device.browser_Safari 2718 non-null uint8 25 device.browser_Firefox 2718 non-null uint8 26 device.browser_Internet Explorer 2718 non-null uint8 27 device.browser_Edge 2718 non-null uint8 28 device.browser_Others 2718 non-null uint8 29 device.operatingSystem_Windows 2718 non-null uint8 30 device.operatingSystem_Macintosh 2718 non-null uint8 31 device.operatingSystem_Android 2718 non-null uint8 32 device.operatingSystem_iOS 2718 non-null uint8 33 device.operatingSystem_Linux 2718 non-null uint8 34 device.operatingSystem_Chrome OS 2718 non-null uint8 35 device.operatingSystem_Others 2718 non-null uint8 36 device.deviceCategory_desktop 2718 non-null uint8 37 device.deviceCategory_mobile 2718 non-null uint8 38 device.deviceCategory_tablet 2718 non-null uint8 39 geoNetwork.continent_(not set) 2718 non-null uint8 40 
geoNetwork.continent_Africa 2718 non-null uint8 41 geoNetwork.continent_Americas 2718 non-null uint8 42 geoNetwork.continent_Asia 2718 non-null uint8 43 geoNetwork.continent_Europe 2718 non-null uint8 44 geoNetwork.continent_Oceania 2718 non-null uint8 45 geoNetwork.subContinent_(not set) 2718 non-null uint8 46 geoNetwork.subContinent_Australasia 2718 non-null uint8 47 geoNetwork.subContinent_Caribbean 2718 non-null uint8 48 geoNetwork.subContinent_Central America 2718 non-null uint8 49 geoNetwork.subContinent_Central Asia 2718 non-null uint8 50 geoNetwork.subContinent_Eastern Africa 2718 non-null uint8 51 geoNetwork.subContinent_Eastern Asia 2718 non-null uint8 52 geoNetwork.subContinent_Eastern Europe 2718 non-null uint8 53 geoNetwork.subContinent_Melanesia 2718 non-null uint8 54 geoNetwork.subContinent_Micronesian Region 2718 non-null uint8 55 geoNetwork.subContinent_Middle Africa 2718 non-null uint8 56 geoNetwork.subContinent_Northern Africa 2718 non-null uint8 57 geoNetwork.subContinent_Northern America 2718 non-null uint8 58 geoNetwork.subContinent_Northern Europe 2718 non-null uint8 59 geoNetwork.subContinent_Polynesia 2718 non-null uint8 60 geoNetwork.subContinent_South America 2718 non-null uint8 61 geoNetwork.subContinent_Southeast Asia 2718 non-null uint8 62 geoNetwork.subContinent_Southern Africa 2718 non-null uint8 63 geoNetwork.subContinent_Southern Asia 2718 non-null uint8 64 geoNetwork.subContinent_Southern Europe 2718 non-null uint8 65 geoNetwork.subContinent_Western Africa 2718 non-null uint8 66 geoNetwork.subContinent_Western Asia 2718 non-null uint8 67 geoNetwork.subContinent_Western Europe 2718 non-null uint8 68 geoNetwork.country_Romania 2718 non-null uint8 69 geoNetwork.country_Canada 2718 non-null uint8 70 geoNetwork.country_Brazil 2718 non-null uint8 71 geoNetwork.country_Italy 2718 non-null uint8 72 geoNetwork.country_Mexico 2718 non-null uint8 73 geoNetwork.country_Chile 2718 non-null uint8 74 geoNetwork.country_South Korea 2718 non-null 
uint8 75 geoNetwork.country_Ireland 2718 non-null uint8 76 geoNetwork.country_Germany 2718 non-null uint8 77 geoNetwork.country_Peru 2718 non-null uint8 78 geoNetwork.country_Others 2718 non-null uint8 79 geoNetwork.city_(not set) 2718 non-null uint8 80 geoNetwork.city_Mexico City 2718 non-null uint8 81 geoNetwork.city_Toronto 2718 non-null uint8 82 geoNetwork.city_Dallas 2718 non-null uint8 83 geoNetwork.city_Dublin 2718 non-null uint8 84 geoNetwork.city_La Victoria 2718 non-null uint8 85 geoNetwork.city_Others 2718 non-null uint8 86 totals.bounces_1 2718 non-null uint8 87 totals.newVisits_1 2718 non-null uint8 dtypes: float64(3), int32(4), int64(8), uint8(73) memory usage: 491.0+ KB
# Concatenate the undersampled zeros with all the non-zeros -> rebalanced development set
df_dev_sample = pd.concat([df_dev_zero_sample, df_dev_nonzero])
df_dev_sample.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 3624 entries, 316404407482592073_1473796330 to 4988517937139937145_1496230436 Columns: 88 entries, visitNumber to totals.newVisits_1 dtypes: float64(3), int32(4), int64(8), uint8(73) memory usage: 654.7+ KB
Volvemos a repetir todos los pasos anteriores (se podría hacer una función que haga todos estos pasos)
# Feature/target split (no inplace=True, so df_dev_sample stays intact).
df_dev_sample_X = df_dev_sample.drop(columns=target_log)  # features
df_dev_sample_y = df_dev_sample[[target_log]]             # target (kept as a DataFrame)
# train/test split of the rebalanced development set
X_train_sample, X_test_sample, y_train_sample, y_test_sample = model_selection.train_test_split(
df_dev_sample_X, # X
df_dev_sample_y, # y
random_state = RANDOM_STATE,
test_size = 0.3 # larger test share than before: the dataset is now much smaller, and we want at least ~1000 test rows
)
X_train_sample.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 2536 entries, 4501448098783068461_1493695160 to 4386324166188241612_1480961192 Columns: 87 entries, visitNumber to totals.newVisits_1 dtypes: float64(2), int32(4), int64(8), uint8(73) memory usage: 438.4+ KB
y_train_sample.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 2536 entries, 4501448098783068461_1493695160 to 4386324166188241612_1480961192 Columns: 1 entries, totals.transactionRevenue_log to totals.transactionRevenue_log dtypes: float64(1) memory usage: 39.6+ KB
X_test_sample.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 1088 entries, 5059380332023098976_1486429189 to 7187268430836970062_1472604143 Columns: 87 entries, visitNumber to totals.newVisits_1 dtypes: float64(2), int32(4), int64(8), uint8(73) memory usage: 188.1+ KB
y_test_sample.info(verbose=False)
<class 'pandas.core.frame.DataFrame'> Index: 1088 entries, 5059380332023098976_1486429189 to 7187268430836970062_1472604143 Columns: 1 entries, totals.transactionRevenue_log to totals.transactionRevenue_log dtypes: float64(1) memory usage: 17.0+ KB
Comprobamos que tengan la misma distribución. Lo hacemos para el target que es el más crítico
y_train_sample.describe()
| totals.transactionRevenue_log | |
|---|---|
| count | 2536.000000 |
| mean | 1.027441 |
| std | 1.881846 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 0.000000 |
| max | 8.342721 |
y_test_sample.describe()
| totals.transactionRevenue_log | |
|---|---|
| count | 1088.000000 |
| mean | 1.023771 |
| std | 1.845300 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.000000 |
| 75% | 1.674320 |
| max | 8.015360 |
Como la media y la desviación típica son muy parecidas, van a tener distribuciones muy parecidas. Por lo tanto, si el modelo se comporta bien para el train pero no para el test podemos decir que está memorizando.
# Instantiate the model (same setup as the baseline model)
resampled_model = xgb.XGBRegressor(
booster = 'gbtree',
random_state = RANDOM_STATE,
n_estimators = 100, # number of boosting rounds (each round re-weights the previous errors -> 100 models -> 100 predictions)
max_depth = 6, # tree depth (number of questions/splits per tree)
verbosity = 1 # message level: 0 (silent), 1 (warning), 2 (info), 3 (debug)
)
# The Gradient Boosting setup is almost the same as RandomForest's (they share most hyperparameters)
%%time
resampled_model.fit(X_train_sample, y_train_sample) # train the model on the rebalanced split
Wall time: 379 ms
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)# Hacemos la predicción
# Predict on the rebalanced test split.
sample_preds = resampled_model.predict(X_test_sample)  # numpy array of log-revenue predictions
test_predictions_sampled = pd.DataFrame(
    data=sample_preds,
    index=X_test_sample.index,
    columns=['Prediction_Sample'],
)
test_predictions_sampled.head(10)
| Prediction_Sample | |
|---|---|
| sessionId | |
| 5059380332023098976_1486429189 | 0.002766 |
| 2833030312056761790_1490386591 | 0.206795 |
| 5415086779040891341_1492124532 | 2.590003 |
| 580589631805288488_1481695487 | 0.139778 |
| 3068571312311710401_1487883929 | 0.106554 |
| 5136915182473070768_1488107794 | -0.014225 |
| 5103624798428789625_1479139535 | 0.022946 |
| 4871930547578771188_1490793874 | 0.013411 |
| 4400040492653151202_1493238023 | 0.016176 |
| 1556829665552852663_1486140782 | -0.021558 |
# Join the real target with the predictions of the rebalanced test split
results_df_sample = y_test_sample.join(test_predictions_sampled)
results_df_sample.head()
| totals.transactionRevenue_log | Prediction_Sample | |
|---|---|---|
| sessionId | ||
| 5059380332023098976_1486429189 | 0.000000 | 0.002766 |
| 2833030312056761790_1490386591 | 0.000000 | 0.206795 |
| 5415086779040891341_1492124532 | 2.707383 | 2.590003 |
| 580589631805288488_1481695487 | 0.000000 | 0.139778 |
| 3068571312311710401_1487883929 | 0.000000 | 0.106554 |
# Rename the columns for readability
results_df_sample.columns = ['Target_Sample', 'Prediction_Sample']
results_df_sample.head()
| Target_Sample | Prediction_Sample | |
|---|---|---|
| sessionId | ||
| 5059380332023098976_1486429189 | 0.000000 | 0.002766 |
| 2833030312056761790_1490386591 | 0.000000 | 0.206795 |
| 5415086779040891341_1492124532 | 2.707383 | 2.590003 |
| 580589631805288488_1481695487 | 0.000000 | 0.139778 |
| 3068571312311710401_1487883929 | 0.000000 | 0.106554 |
results_df_sample.sort_values(by = 'Target_Sample', ascending = False).head(10)
| Target_Sample | Prediction_Sample | |
|---|---|---|
| sessionId | ||
| 4759981878863963838_1490729055 | 8.015360 | 6.854543 |
| 5283278344963059120_1471737021 | 7.279850 | 3.291704 |
| 7107718938057621237_1473715800 | 7.081297 | 5.267385 |
| 7261302704200810675_1487644175 | 6.883042 | 5.800747 |
| 4863941202505455588_1494009960 | 6.685112 | 4.070213 |
| 4471415710206918415_1478235992 | 6.633318 | 3.268886 |
| 0432606793105704004_1485327772 | 6.555072 | 3.947776 |
| 5087617268001908286_1478632325 | 6.405212 | 4.715690 |
| 253776212101014141_1488832562 | 6.398379 | 3.618072 |
| 0634506282991036601_1493748688 | 6.387300 | 5.992160 |
Evaluación de resampled_model mediante métricas
# Error metrics for the resampled model.
delta = results_df_sample['Target_Sample'] - results_df_sample['Prediction_Sample']
results_df_sample['error_sample'] = delta                  # signed error
results_df_sample['squared_error_sample'] = delta ** 2     # squared error
results_df_sample['rooted_squared_error_sample'] = np.sqrt(results_df_sample['squared_error_sample'])
mse_sample = results_df_sample['squared_error_sample'].mean()  # Mean Squared Error
rmse_sample = np.sqrt(mse_sample)                              # Root Mean Squared Error
print(f'MSE_Sample: {np.round(mse_sample,4)} - RMSE_Sample: {np.round(rmse_sample,4)}')
MSE_Sample: 1.0314 - RMSE_Sample: 1.0156
Comparamos los resultados entre el primer modelo y el modelo rebalanceado
# Baseline model metrics (trained on the unbalanced data)
print(f'MSE: {np.round(mse,4)} - RMSE: {np.round(rmse,4)}')
MSE: 0.1883 - RMSE: 0.434
# Resampled model metrics (rebalanced data — not directly comparable to the baseline)
print(f'MSE_Sample: {np.round(mse_sample,4)} - RMSE_Sample: {np.round(rmse_sample,4)}')
MSE_Sample: 1.0314 - RMSE_Sample: 1.0156
# Compare the resampled model's MSE against the "dumb" always-predict-zero baseline
results_df_sample['squared_error_modelo_tonto_sample'] = results_df_sample['Target_Sample']**2
mse_tonto_sample = results_df_sample['squared_error_modelo_tonto_sample'].mean()
print(f'MSE_Sample: {np.round(mse_sample,4)} y Benchmark_tonto_Sample: {np.round(mse_tonto_sample,4)}')
MSE_Sample: 1.0314 y Benchmark_tonto_Sample: 4.4501
Como no puedo comparar las métricas entre los dos modelos por ser distribuciones diferentes y tampoco lo puedo comparar con el modelo tonto, ¿cómo puedo saber si el modelo es bueno?
# Error distribution of the resampled model, buyers only
plt.figure(figsize=(15,5))
sns.distplot(
results_df_sample[results_df_sample['Target_Sample'] > 0]['rooted_squared_error_sample'],
fit=stats.norm
);
Antes teníamos una media de los errores en torno al 3, ahora la tenemos en torno al 1. Ahora la media del error entre la gente que me compra es muchísimo menor. Por lo tanto el modelo rebalanceado va a funcionar mucho mejor que el anterior.
# Distribution of the resampled model's predictions, buyers only
plt.figure(figsize=(15,5))
sns.distplot(
results_df_sample[results_df_sample['Target_Sample'] > 0]['Prediction_Sample'],
fit=stats.norm
);
# Distribution of the real target in the rebalanced test split, buyers only
plt.figure(figsize=(15,5))
sns.distplot(
results_df_sample[results_df_sample['Target_Sample'] > 0]['Target_Sample'],
fit=stats.norm
);
La distribución del target sigue siendo exactamente la misma, ya que no he hecho oversampling (he mantenido exactamente la misma gente que me compraba). Mientras que la distribución de la predicción ya se parece mucho más a la del target.
Validamos el modelo rebalanceado
# Predictions on the validation split
val_predictions = pd.DataFrame(
resampled_model.predict(df_val_X), # validation features --> returns an array
index = df_val_X.index, # validation session IDs (the index, not the target)
columns = ['Prediction'])
# Join the real target with the validation predictions
val_results_df = df_val_y.join(val_predictions)
val_results_df.head(10)
| totals.transactionRevenue_log | Prediction | |
|---|---|---|
| sessionId | ||
| 0000572434142265465_1500605115 | 0.0 | -0.051824 |
| 0001285462512259769_1499086856 | 0.0 | 0.038284 |
| 0001399220722808057_1500076255 | 0.0 | 0.007529 |
| 0001527863526384268_1498836434 | 0.0 | -0.027790 |
| 0001563469934876372_1498960250 | 0.0 | 0.133938 |
| 0001674026512373272_1499210304 | 0.0 | 0.426844 |
| 000170187170673177_1497108503 | 0.0 | -0.255489 |
| 000170187170673177_1497480138 | 0.0 | 0.104530 |
| 0004912455764059748_1498617507 | 0.0 | 2.069660 |
| 0004946235279474055_1498961332 | 0.0 | -0.011097 |
# Rename the columns
val_results_df.columns = ['Target', 'Prediction']
# Validate with the same metrics as before
val_results_df['error'] = val_results_df['Target'] - val_results_df['Prediction']
val_results_df['squared_error'] = val_results_df['error'] ** 2
val_results_df['rooted_squared_error'] = np.sqrt(val_results_df['squared_error'])
mse_val = val_results_df['squared_error'].mean()
rmse_val = np.sqrt(mse_val)
print(f'MSE_Val: {np.round(mse_val,4)} - RMSE_Val: {np.round(rmse_val,4)}')
MSE_Val: 0.669 - RMSE_Val: 0.8179
# Error distribution on validation, buyers only
plt.figure(figsize=(15,5))
sns.distplot(
val_results_df[val_results_df['Target'] > 0]['rooted_squared_error'],
fit=stats.norm
);
Comprobación del overfitting
# Instantiate the model (same setup as before, plus an explicit eval metric)
resampled_model_rmse = xgb.XGBRegressor(
booster = 'gbtree',
eval_metric = "rmse", # metric reported on each eval_set pair during training
random_state = RANDOM_STATE,
n_estimators = 100, # number of boosting rounds (each round re-weights the previous errors -> 100 models -> 100 predictions)
max_depth = 6, # tree depth (number of questions/splits per tree)
verbosity = 1 # message level: 0 (silent), 1 (warning), 2 (info), 3 (debug)
)
# The Gradient Boosting setup is almost the same as RandomForest's (they share most hyperparameters)
%%time
# Train while logging train (validation_0) and test (validation_1) RMSE per boosting round,
# to see at which round the test error stops improving (overfitting check)
resampled_model_rmse.fit( # train the model
X_train_sample, y_train_sample,
eval_set = [(X_train_sample, y_train_sample), (X_test_sample, y_test_sample)],
verbose = True
)
[0] validation_0-rmse:1.51154 validation_1-rmse:1.52228 [1] validation_0-rmse:1.22818 validation_1-rmse:1.27740 [2] validation_0-rmse:1.04765 validation_1-rmse:1.12463 [3] validation_0-rmse:0.92304 validation_1-rmse:1.04240 [4] validation_0-rmse:0.84414 validation_1-rmse:1.00107 [5] validation_0-rmse:0.79519 validation_1-rmse:0.98262 [6] validation_0-rmse:0.74280 validation_1-rmse:0.97027 [7] validation_0-rmse:0.70683 validation_1-rmse:0.96940 [8] validation_0-rmse:0.68534 validation_1-rmse:0.96700 [9] validation_0-rmse:0.64970 validation_1-rmse:0.97128 [10] validation_0-rmse:0.62789 validation_1-rmse:0.96574 [11] validation_0-rmse:0.59806 validation_1-rmse:0.96946 [12] validation_0-rmse:0.58593 validation_1-rmse:0.97178 [13] validation_0-rmse:0.57629 validation_1-rmse:0.97280 [14] validation_0-rmse:0.57110 validation_1-rmse:0.97492 [15] validation_0-rmse:0.54012 validation_1-rmse:0.97604 [16] validation_0-rmse:0.53307 validation_1-rmse:0.97687 [17] validation_0-rmse:0.52838 validation_1-rmse:0.97866 [18] validation_0-rmse:0.52151 validation_1-rmse:0.97916 [19] validation_0-rmse:0.50357 validation_1-rmse:0.98058 [20] validation_0-rmse:0.49150 validation_1-rmse:0.98109 [21] validation_0-rmse:0.47581 validation_1-rmse:0.98218 [22] validation_0-rmse:0.47247 validation_1-rmse:0.98238 [23] validation_0-rmse:0.46288 validation_1-rmse:0.98317 [24] validation_0-rmse:0.45526 validation_1-rmse:0.98503 [25] validation_0-rmse:0.45336 validation_1-rmse:0.98616 [26] validation_0-rmse:0.43852 validation_1-rmse:0.98577 [27] validation_0-rmse:0.42884 validation_1-rmse:0.98718 [28] validation_0-rmse:0.42683 validation_1-rmse:0.98760 [29] validation_0-rmse:0.42626 validation_1-rmse:0.98750 [30] validation_0-rmse:0.42186 validation_1-rmse:0.99016 [31] validation_0-rmse:0.41553 validation_1-rmse:0.99131 [32] validation_0-rmse:0.40480 validation_1-rmse:0.99196 [33] validation_0-rmse:0.40399 validation_1-rmse:0.99236 [34] validation_0-rmse:0.39917 validation_1-rmse:0.99339 [35] 
validation_0-rmse:0.39270 validation_1-rmse:0.99154 [36] validation_0-rmse:0.37754 validation_1-rmse:0.99199 [37] validation_0-rmse:0.37473 validation_1-rmse:0.99149 [38] validation_0-rmse:0.36582 validation_1-rmse:0.99243 [39] validation_0-rmse:0.35725 validation_1-rmse:0.99588 [40] validation_0-rmse:0.34814 validation_1-rmse:0.99657 [41] validation_0-rmse:0.34676 validation_1-rmse:0.99750 [42] validation_0-rmse:0.34440 validation_1-rmse:0.99684 [43] validation_0-rmse:0.34382 validation_1-rmse:0.99740 [44] validation_0-rmse:0.33255 validation_1-rmse:0.99782 [45] validation_0-rmse:0.32162 validation_1-rmse:0.99891 [46] validation_0-rmse:0.31556 validation_1-rmse:0.99929 [47] validation_0-rmse:0.31250 validation_1-rmse:1.00028 [48] validation_0-rmse:0.30387 validation_1-rmse:0.99926 [49] validation_0-rmse:0.29132 validation_1-rmse:1.00171 [50] validation_0-rmse:0.28241 validation_1-rmse:1.00352 [51] validation_0-rmse:0.27982 validation_1-rmse:1.00439 [52] validation_0-rmse:0.26997 validation_1-rmse:1.00473 [53] validation_0-rmse:0.26220 validation_1-rmse:1.00562 [54] validation_0-rmse:0.25855 validation_1-rmse:1.00540 [55] validation_0-rmse:0.25535 validation_1-rmse:1.00569 [56] validation_0-rmse:0.24939 validation_1-rmse:1.00604 [57] validation_0-rmse:0.24330 validation_1-rmse:1.00669 [58] validation_0-rmse:0.23850 validation_1-rmse:1.00683 [59] validation_0-rmse:0.23560 validation_1-rmse:1.00758 [60] validation_0-rmse:0.23516 validation_1-rmse:1.00782 [61] validation_0-rmse:0.22840 validation_1-rmse:1.00778 [62] validation_0-rmse:0.22187 validation_1-rmse:1.00862 [63] validation_0-rmse:0.21796 validation_1-rmse:1.00801 [64] validation_0-rmse:0.21623 validation_1-rmse:1.00811 [65] validation_0-rmse:0.21363 validation_1-rmse:1.00767 [66] validation_0-rmse:0.21213 validation_1-rmse:1.00885 [67] validation_0-rmse:0.20685 validation_1-rmse:1.00999 [68] validation_0-rmse:0.20100 validation_1-rmse:1.01043 [69] validation_0-rmse:0.19768 validation_1-rmse:1.00990 [70] 
validation_0-rmse:0.19726 validation_1-rmse:1.00976 [71] validation_0-rmse:0.19535 validation_1-rmse:1.01083 [72] validation_0-rmse:0.19277 validation_1-rmse:1.00975 [73] validation_0-rmse:0.18760 validation_1-rmse:1.00912 [74] validation_0-rmse:0.18460 validation_1-rmse:1.00973 [75] validation_0-rmse:0.18332 validation_1-rmse:1.00966 [76] validation_0-rmse:0.18079 validation_1-rmse:1.01081 [77] validation_0-rmse:0.17791 validation_1-rmse:1.01065 [78] validation_0-rmse:0.17722 validation_1-rmse:1.01108 [79] validation_0-rmse:0.17299 validation_1-rmse:1.01189 [80] validation_0-rmse:0.17111 validation_1-rmse:1.01168 [81] validation_0-rmse:0.17049 validation_1-rmse:1.01195 [82] validation_0-rmse:0.16890 validation_1-rmse:1.01293 [83] validation_0-rmse:0.16032 validation_1-rmse:1.01355 [84] validation_0-rmse:0.15821 validation_1-rmse:1.01357 [85] validation_0-rmse:0.15623 validation_1-rmse:1.01343 [86] validation_0-rmse:0.15341 validation_1-rmse:1.01304 [87] validation_0-rmse:0.15221 validation_1-rmse:1.01304 [88] validation_0-rmse:0.14833 validation_1-rmse:1.01375 [89] validation_0-rmse:0.14499 validation_1-rmse:1.01400 [90] validation_0-rmse:0.14489 validation_1-rmse:1.01397 [91] validation_0-rmse:0.14478 validation_1-rmse:1.01382 [92] validation_0-rmse:0.14472 validation_1-rmse:1.01371 [93] validation_0-rmse:0.14246 validation_1-rmse:1.01376 [94] validation_0-rmse:0.14002 validation_1-rmse:1.01364 [95] validation_0-rmse:0.13801 validation_1-rmse:1.01431 [96] validation_0-rmse:0.13512 validation_1-rmse:1.01435 [97] validation_0-rmse:0.13065 validation_1-rmse:1.01531 [98] validation_0-rmse:0.12940 validation_1-rmse:1.01543 [99] validation_0-rmse:0.12925 validation_1-rmse:1.01559 Wall time: 586 ms
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
eval_metric='rmse', gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
eval_metric='rmse', gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)Vemos que el modelo ha memorizado, independientemente del número de estimadores que se utilicen, por lo que tenemos que reducir el número de preguntas que hace. Le daremos una profundidad de 3.
# Instantiate the model (same setup as before, but limiting tree depth to 3
# to curb the memorization observed with the default depth).
resampled_model_rmse = xgb.XGBRegressor(
    booster = 'gbtree',
    eval_metric = "rmse",          # evaluation metric reported during training
    random_state = RANDOM_STATE,
    n_estimators = 100,            # boosting rounds: 100 trees, each reweighting the previous errors
    max_depth = 3,                 # decision-tree depth (how many split levels each tree may use)
    verbosity = 1                  # message level --> 0 (silent), 1 (warning), 2 (info), 3 (debug)
)
# The Gradient Boosting setup is almost identical to RandomForest's (they share the same parameters)
%%time
resampled_model_rmse.fit( # train the model (wall time measured by the %%time cell magic)
X_train_sample, y_train_sample,
eval_set = [(X_train_sample, y_train_sample), (X_test_sample, y_test_sample)],
verbose = True
)
[0] validation_0-rmse:1.55929 validation_1-rmse:1.52702 [1] validation_0-rmse:1.31226 validation_1-rmse:1.27931 [2] validation_0-rmse:1.16041 validation_1-rmse:1.13422 [3] validation_0-rmse:1.07151 validation_1-rmse:1.05292 [4] validation_0-rmse:1.01379 validation_1-rmse:1.00520 [5] validation_0-rmse:0.98028 validation_1-rmse:0.97473 [6] validation_0-rmse:0.95907 validation_1-rmse:0.96161 [7] validation_0-rmse:0.94442 validation_1-rmse:0.95102 [8] validation_0-rmse:0.93175 validation_1-rmse:0.94612 [9] validation_0-rmse:0.92052 validation_1-rmse:0.93975 [10] validation_0-rmse:0.91180 validation_1-rmse:0.93985 [11] validation_0-rmse:0.90527 validation_1-rmse:0.93814 [12] validation_0-rmse:0.89767 validation_1-rmse:0.93529 [13] validation_0-rmse:0.88963 validation_1-rmse:0.93417 [14] validation_0-rmse:0.88485 validation_1-rmse:0.93299 [15] validation_0-rmse:0.87971 validation_1-rmse:0.92858 [16] validation_0-rmse:0.87411 validation_1-rmse:0.93035 [17] validation_0-rmse:0.86858 validation_1-rmse:0.93030 [18] validation_0-rmse:0.86163 validation_1-rmse:0.92961 [19] validation_0-rmse:0.85770 validation_1-rmse:0.93429 [20] validation_0-rmse:0.85378 validation_1-rmse:0.93326 [21] validation_0-rmse:0.85089 validation_1-rmse:0.93275 [22] validation_0-rmse:0.84695 validation_1-rmse:0.93173 [23] validation_0-rmse:0.84408 validation_1-rmse:0.93412 [24] validation_0-rmse:0.83925 validation_1-rmse:0.93285 [25] validation_0-rmse:0.83633 validation_1-rmse:0.93452 [26] validation_0-rmse:0.83457 validation_1-rmse:0.93299 [27] validation_0-rmse:0.83323 validation_1-rmse:0.93276 [28] validation_0-rmse:0.82998 validation_1-rmse:0.93339 [29] validation_0-rmse:0.82746 validation_1-rmse:0.93724 [30] validation_0-rmse:0.82667 validation_1-rmse:0.93703 [31] validation_0-rmse:0.82525 validation_1-rmse:0.93701 [32] validation_0-rmse:0.82182 validation_1-rmse:0.93780 [33] validation_0-rmse:0.82049 validation_1-rmse:0.93897 [34] validation_0-rmse:0.81645 validation_1-rmse:0.94055 [35] 
validation_0-rmse:0.81273 validation_1-rmse:0.93765 [36] validation_0-rmse:0.81138 validation_1-rmse:0.93739 [37] validation_0-rmse:0.80864 validation_1-rmse:0.93570 [38] validation_0-rmse:0.80660 validation_1-rmse:0.93619 [39] validation_0-rmse:0.80425 validation_1-rmse:0.93835 [40] validation_0-rmse:0.79969 validation_1-rmse:0.93600 [41] validation_0-rmse:0.79824 validation_1-rmse:0.93601 [42] validation_0-rmse:0.79524 validation_1-rmse:0.93669 [43] validation_0-rmse:0.79313 validation_1-rmse:0.93812 [44] validation_0-rmse:0.79226 validation_1-rmse:0.93843 [45] validation_0-rmse:0.79122 validation_1-rmse:0.93876 [46] validation_0-rmse:0.79045 validation_1-rmse:0.93831 [47] validation_0-rmse:0.78816 validation_1-rmse:0.93840 [48] validation_0-rmse:0.78535 validation_1-rmse:0.93845 [49] validation_0-rmse:0.78175 validation_1-rmse:0.93909 [50] validation_0-rmse:0.77844 validation_1-rmse:0.94178 [51] validation_0-rmse:0.77556 validation_1-rmse:0.94363 [52] validation_0-rmse:0.77306 validation_1-rmse:0.94303 [53] validation_0-rmse:0.77054 validation_1-rmse:0.94572 [54] validation_0-rmse:0.76766 validation_1-rmse:0.94596 [55] validation_0-rmse:0.76508 validation_1-rmse:0.94669 [56] validation_0-rmse:0.76146 validation_1-rmse:0.94666 [57] validation_0-rmse:0.76081 validation_1-rmse:0.94697 [58] validation_0-rmse:0.75774 validation_1-rmse:0.94633 [59] validation_0-rmse:0.75726 validation_1-rmse:0.94681 [60] validation_0-rmse:0.75641 validation_1-rmse:0.94702 [61] validation_0-rmse:0.75609 validation_1-rmse:0.94745 [62] validation_0-rmse:0.75367 validation_1-rmse:0.94889 [63] validation_0-rmse:0.75308 validation_1-rmse:0.95005 [64] validation_0-rmse:0.75161 validation_1-rmse:0.95086 [65] validation_0-rmse:0.75112 validation_1-rmse:0.95071 [66] validation_0-rmse:0.74942 validation_1-rmse:0.94915 [67] validation_0-rmse:0.74542 validation_1-rmse:0.95051 [68] validation_0-rmse:0.74307 validation_1-rmse:0.94983 [69] validation_0-rmse:0.73894 validation_1-rmse:0.94996 [70] 
validation_0-rmse:0.73817 validation_1-rmse:0.95022 [71] validation_0-rmse:0.73505 validation_1-rmse:0.95040 [72] validation_0-rmse:0.73426 validation_1-rmse:0.94891 [73] validation_0-rmse:0.73236 validation_1-rmse:0.94895 [74] validation_0-rmse:0.72956 validation_1-rmse:0.94945 [75] validation_0-rmse:0.72837 validation_1-rmse:0.95000 [76] validation_0-rmse:0.72770 validation_1-rmse:0.95017 [77] validation_0-rmse:0.72548 validation_1-rmse:0.95176 [78] validation_0-rmse:0.72497 validation_1-rmse:0.95218 [79] validation_0-rmse:0.72384 validation_1-rmse:0.95240 [80] validation_0-rmse:0.72152 validation_1-rmse:0.95243 [81] validation_0-rmse:0.71924 validation_1-rmse:0.95278 [82] validation_0-rmse:0.71869 validation_1-rmse:0.95313 [83] validation_0-rmse:0.71649 validation_1-rmse:0.95288 [84] validation_0-rmse:0.71379 validation_1-rmse:0.95401 [85] validation_0-rmse:0.71298 validation_1-rmse:0.95284 [86] validation_0-rmse:0.71224 validation_1-rmse:0.95251 [87] validation_0-rmse:0.71074 validation_1-rmse:0.95339 [88] validation_0-rmse:0.70989 validation_1-rmse:0.95413 [89] validation_0-rmse:0.70942 validation_1-rmse:0.95435 [90] validation_0-rmse:0.70815 validation_1-rmse:0.95441 [91] validation_0-rmse:0.70752 validation_1-rmse:0.95401 [92] validation_0-rmse:0.70727 validation_1-rmse:0.95426 [93] validation_0-rmse:0.70563 validation_1-rmse:0.95550 [94] validation_0-rmse:0.70532 validation_1-rmse:0.95678 [95] validation_0-rmse:0.70376 validation_1-rmse:0.95694 [96] validation_0-rmse:0.70190 validation_1-rmse:0.95843 [97] validation_0-rmse:0.70052 validation_1-rmse:0.95900 [98] validation_0-rmse:0.69996 validation_1-rmse:0.95813 [99] validation_0-rmse:0.69712 validation_1-rmse:0.95793 Wall time: 423 ms
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
eval_metric='rmse', gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
eval_metric='rmse', gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)Utilizaremos una profundidad de árbol de 3 y 7 estimadores
# Instantiate the final model: 7 shallow trees of depth 3, the configuration
# chosen after inspecting the validation RMSE curves above.
resampled_model = xgb.XGBRegressor(
    booster = 'gbtree',
    random_state = RANDOM_STATE,
    n_estimators = 7,    # boosting rounds: one tree per round, each reweighting the previous errors
    max_depth = 3,       # decision-tree depth (how many split levels each tree may use)
    verbosity = 1        # message level --> 0 (silent), 1 (warning), 2 (info), 3 (debug)
)
# Fit the model on the resampled training data.
resampled_model.fit(X_train_sample, y_train_sample)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=7, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=7, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=1)# Hacemos la predicción
# Predict on the held-out sample and wrap the result in a DataFrame
# aligned with the test-set index.
test_predictions_sampled = pd.DataFrame(
    resampled_model.predict(X_test_sample),
    index = X_test_sample.index,
    columns = ['Prediction_Sample'],
)
# Put actual target and prediction side by side.
results_df_sample = y_test_sample.join(test_predictions_sampled)
results_df_sample.columns = ['Target_Sample', 'Prediction_Sample']
# Show the 10 sessions with the highest actual spend.
results_df_sample.sort_values(by = 'Target_Sample', ascending = False).head(10)
| Target_Sample | Prediction_Sample | |
|---|---|---|
| sessionId | ||
| 4759981878863963838_1490729055 | 8.015360 | 4.284503 |
| 5283278344963059120_1471737021 | 7.279850 | 3.414380 |
| 7107718938057621237_1473715800 | 7.081297 | 4.284503 |
| 7261302704200810675_1487644175 | 6.883042 | 4.181097 |
| 4863941202505455588_1494009960 | 6.685112 | 4.215508 |
| 4471415710206918415_1478235992 | 6.633318 | 3.441290 |
| 0432606793105704004_1485327772 | 6.555072 | 4.073001 |
| 5087617268001908286_1478632325 | 6.405212 | 3.855101 |
| 253776212101014141_1488832562 | 6.398379 | 2.768567 |
| 0634506282991036601_1493748688 | 6.387300 | 3.612922 |
# Evaluate resampled_model with per-row error columns plus MSE/RMSE.
results_df_sample['error_sample'] = (
    results_df_sample['Target_Sample'] - results_df_sample['Prediction_Sample']
)
results_df_sample['squared_error_sample'] = results_df_sample['error_sample'] ** 2
results_df_sample['rooted_squared_error_sample'] = np.sqrt(results_df_sample['squared_error_sample'])
# Aggregate metrics over the whole test sample.
mse_sample = results_df_sample['squared_error_sample'].mean()
rmse_sample = np.sqrt(mse_sample)
print(f'MSE_Sample: {np.round(mse_sample,4)} - RMSE_Sample: {np.round(rmse_sample,4)}')
MSE_Sample: 0.9247 - RMSE_Sample: 0.9616
# Distribution of the sample model's error for buyers only (Target > 0)
plt.figure(figsize=(15,5))
sns.distplot(
results_df_sample[results_df_sample['Target_Sample'] > 0]['rooted_squared_error_sample'],
fit=stats.norm # overlay a fitted normal curve for reference
);
Antes teníamos una media de los errores en torno al 3; ahora la tenemos en torno al 1. La media del error entre la gente que compra es ahora muchísimo menor, por lo que el modelo rebalanceado va a funcionar mucho mejor que el anterior.
# Distribution of Prediction_Sample for buyers only (Target > 0)
plt.figure(figsize=(15,5))
sns.distplot(
results_df_sample[results_df_sample['Target_Sample'] > 0]['Prediction_Sample'],
fit=stats.norm # overlay a fitted normal curve for reference
);
# Distribution of the target for buyers only (Target > 0)
plt.figure(figsize=(15,5))
sns.distplot(
results_df_sample[results_df_sample['Target_Sample'] > 0]['Target_Sample'],
fit=stats.norm # overlay a fitted normal curve for reference
);
Generamos funciones para hacerlo todo de golpe
def result_xgbTree(X_train, y_train, X_test, y_test, random_state, max_depth, n_estimators):
    '''
    Target vs Prediction for a regression problem using a tree-based
    XGBRegressor ('gbtree' booster).

    Trains the model on the train split, predicts on the test split and
    prints the MSE/RMSE evaluation metrics.

    Returns a DataFrame indexed like y_test with columns Target, Prediction,
    error, squared_error and rooted_squared_error.
    '''
    # Build and fit the boosted-tree regressor.
    tree_model = xgb.XGBRegressor(
        booster = 'gbtree',
        random_state = random_state,
        n_estimators = n_estimators,
        max_depth = max_depth,
        verbosity = 1,
    )
    tree_model.fit(X_train, y_train)
    # Predictions aligned with the test index, joined next to the target.
    predictions = pd.DataFrame(tree_model.predict(X_test), index = X_test.index, columns = ['Prediction'])
    results_df = y_test.join(predictions)
    results_df.columns = ['Target', 'Prediction']
    # Per-row errors and aggregate metrics.
    results_df['error'] = results_df['Target'] - results_df['Prediction']
    results_df['squared_error'] = results_df['error'] ** 2
    results_df['rooted_squared_error'] = np.sqrt(results_df['squared_error'])
    mse = results_df['squared_error'].mean()
    rmse = np.sqrt(mse)
    print(f'MSE: {np.round(mse,4)} - RMSE: {np.round(rmse,4)}')
    return results_df
# Train/evaluate the tree-based XGB model on the resampled splits
# with the depth/estimator settings chosen above.
results_df_sample_tree = result_xgbTree(
    X_train = X_train_sample,
    y_train = y_train_sample,
    X_test = X_test_sample,
    y_test = y_test_sample,
    random_state = RANDOM_STATE,
    max_depth = 3,
    n_estimators = 7,
)
results_df_sample_tree.head()
MSE: 0.9247 - RMSE: 0.9616
| Target | Prediction | error | squared_error | rooted_squared_error | |
|---|---|---|---|---|---|
| sessionId | |||||
| 5059380332023098976_1486429189 | 0.000000 | 0.030258 | -0.030258 | 0.000916 | 0.030258 |
| 2833030312056761790_1490386591 | 0.000000 | 0.441446 | -0.441446 | 0.194875 | 0.441446 |
| 5415086779040891341_1492124532 | 2.707383 | 2.496804 | 0.210579 | 0.044343 | 0.210579 |
| 580589631805288488_1481695487 | 0.000000 | 0.178037 | -0.178037 | 0.031697 | 0.178037 |
| 3068571312311710401_1487883929 | 0.000000 | 0.030258 | -0.030258 | 0.000916 | 0.030258 |
def figure_xgb(dataframe):
    '''
    Plot the distribution (with a fitted normal curve) of the error,
    the prediction and the target, restricted to rows with Target > 0.
    '''
    # Keep only buyers; non-buyers (Target == 0) would dominate the plots.
    buyers = dataframe[dataframe['Target'] > 0]
    # Column positions 4, 1, 0 = rooted_squared_error, Prediction, Target.
    for position in (4, 1, 0):
        plt.figure(figsize=(15, 5))
        sns.distplot(buyers[dataframe.columns[position]], fit=stats.norm)
figure_xgb (dataframe = results_df_sample_tree)
Feature Importances
# Top 20 most influential features according to the trained model.
xgb_feature_importances = (
    pd.Series(resampled_model.feature_importances_, index = X_train_sample.columns)
    .sort_values(ascending = False)
    .head(20)
)
xgb_feature_importances
totals.hits 0.479780 geoNetwork.continent_Americas 0.162018 channelGrouping_Organic Search 0.079769 device.deviceCategory_mobile 0.055289 visitHour 0.048785 visitNumber 0.041897 device.operatingSystem_Macintosh 0.028976 device.browser_Chrome 0.024719 channelGrouping_Direct 0.023122 channelGrouping_Referral 0.022575 device.operatingSystem_Chrome OS 0.020643 monthDay 0.012429 geoNetwork.subContinent_Middle Africa 0.000000 geoNetwork.subContinent_Southern Africa 0.000000 geoNetwork.subContinent_Western Asia 0.000000 geoNetwork.subContinent_Western Africa 0.000000 geoNetwork.subContinent_Southern Europe 0.000000 geoNetwork.subContinent_Southern Asia 0.000000 geoNetwork.subContinent_Melanesia 0.000000 geoNetwork.subContinent_Eastern Europe 0.000000 dtype: float32
# Visualize the most important features as a bar chart
plt.figure(figsize=(15,5))
xgb_feature_importances.plot(kind = 'bar', ylabel = 'Coeficientes de ponderación')
plt.title('Feature Importances');
A la hora de llevarlo a producción, me quedaría con los 12 primeros atributos
2.- XGBoost --> con Regresión Lineal
def result_xgbLineal(X_train, y_train, X_test, y_test):
    '''
    Target vs Prediction for a regression problem using XGBRegressor with
    a linear booster ('gblinear').

    Trains the model on the train split, predicts on the test split and
    prints the MSE/RMSE evaluation metrics.

    Returns a DataFrame indexed like y_test with columns Target, Prediction,
    error, squared_error and rooted_squared_error.
    '''
    # Build and fit the linear-booster regressor.
    linear_model = xgb.XGBRegressor(booster = 'gblinear', verbosity = 1)
    linear_model.fit(X_train, y_train)
    # Predictions aligned with the test index, joined next to the target.
    predictions = pd.DataFrame(linear_model.predict(X_test), index = X_test.index, columns = ['Prediction'])
    results_df = y_test.join(predictions)
    results_df.columns = ['Target', 'Prediction']
    # Per-row errors and aggregate metrics.
    results_df['error'] = results_df['Target'] - results_df['Prediction']
    results_df['squared_error'] = results_df['error'] ** 2
    results_df['rooted_squared_error'] = np.sqrt(results_df['squared_error'])
    mse = results_df['squared_error'].mean()
    rmse = np.sqrt(mse)
    print(f'MSE: {np.round(mse,4)} - RMSE: {np.round(rmse,4)}')
    return results_df
# Fit/evaluate the linear-booster variant and inspect the biggest spenders first.
results_df_sample_lineal = result_xgbLineal(
    X_train = X_train_sample,
    y_train = y_train_sample,
    X_test = X_test_sample,
    y_test = y_test_sample,
)
results_df_sample_lineal.sort_values(by = 'Target', ascending = False)
MSE: 1.1725 - RMSE: 1.0828
| Target | Prediction | error | squared_error | rooted_squared_error | |
|---|---|---|---|---|---|
| sessionId | |||||
| 4759981878863963838_1490729055 | 8.015360 | 5.069016 | 2.946344 | 8.680945 | 2.946344 |
| 5283278344963059120_1471737021 | 7.279850 | 3.617985 | 3.661865 | 13.409255 | 3.661865 |
| 7107718938057621237_1473715800 | 7.081297 | 4.058229 | 3.023068 | 9.138939 | 3.023068 |
| 7261302704200810675_1487644175 | 6.883042 | 4.323370 | 2.559673 | 6.551925 | 2.559673 |
| 4863941202505455588_1494009960 | 6.685112 | 4.124288 | 2.560824 | 6.557817 | 2.560824 |
| ... | ... | ... | ... | ... | ... |
| 87109475150246226_1489667378 | 0.000000 | 0.047116 | -0.047116 | 0.002220 | 0.047116 |
| 4379775968824328227_1483537542 | 0.000000 | 0.006856 | -0.006856 | 0.000047 | 0.006856 |
| 7067296665967668087_1488199537 | 0.000000 | -0.410359 | 0.410359 | 0.168395 | 0.410359 |
| 9770415850483050649_1490394558 | 0.000000 | 1.620343 | -1.620343 | 2.625511 | 1.620343 |
| 8822111466921247316_1481795090 | 0.000000 | 0.636169 | -0.636169 | 0.404711 | 0.636169 |
1088 rows × 5 columns
results_df_sample_tree
| Target | Prediction | error | squared_error | rooted_squared_error | |
|---|---|---|---|---|---|
| sessionId | |||||
| 5059380332023098976_1486429189 | 0.000000 | 0.030258 | -0.030258 | 0.000916 | 0.030258 |
| 2833030312056761790_1490386591 | 0.000000 | 0.441446 | -0.441446 | 0.194875 | 0.441446 |
| 5415086779040891341_1492124532 | 2.707383 | 2.496804 | 0.210579 | 0.044343 | 0.210579 |
| 580589631805288488_1481695487 | 0.000000 | 0.178037 | -0.178037 | 0.031697 | 0.178037 |
| 3068571312311710401_1487883929 | 0.000000 | 0.030258 | -0.030258 | 0.000916 | 0.030258 |
| ... | ... | ... | ... | ... | ... |
| 8293630998886357548_1473855669 | 0.000000 | 0.385314 | -0.385314 | 0.148467 | 0.385314 |
| 1022140804745247367_1476598024 | 0.000000 | 2.785280 | -2.785280 | 7.757782 | 2.785280 |
| 0769891055069398469_1475454049 | 3.217675 | 3.675765 | -0.458089 | 0.209846 | 0.458089 |
| 09972074242749395_1495180321 | 0.000000 | 0.030258 | -0.030258 | 0.000916 | 0.030258 |
| 7187268430836970062_1472604143 | 3.328268 | 2.814122 | 0.514146 | 0.264346 | 0.514146 |
1088 rows × 5 columns
Comparamos el xgbregressor aplicando decision tree con el xgbregressor aplicando regresión lineal para el dataset rebalanceado
# Side-by-side comparison of both models (tree vs linear booster).
# .copy() makes comp_results_df_sample an independent DataFrame: assigning
# new columns to a column-selection slice triggers pandas'
# SettingWithCopyWarning and the writes are not guaranteed to stick.
comp_results_df_sample = results_df_sample_tree[['Target', 'Prediction', 'error']].copy()
comp_results_df_sample.columns = ['Target', 'Prediction_Tree', 'Error_Tree']
# Attach the linear model's prediction and error (indexes are aligned by pandas).
comp_results_df_sample['Prediction_Lineal'] = results_df_sample_lineal['Prediction']
comp_results_df_sample['Error_Lineal'] = results_df_sample_lineal['error']
# Sort by actual spend, largest first.
comp_results_df_sample.sort_values(by='Target', ascending=False)
| Target | Prediction_Tree | Error_Tree | Prediction_Lineal | Error_Lineal | |
|---|---|---|---|---|---|
| sessionId | |||||
| 4759981878863963838_1490729055 | 8.015360 | 4.284503 | 3.730858 | 5.069016 | 2.946344 |
| 5283278344963059120_1471737021 | 7.279850 | 3.414380 | 3.865470 | 3.617985 | 3.661865 |
| 7107718938057621237_1473715800 | 7.081297 | 4.284503 | 2.796794 | 4.058229 | 3.023068 |
| 7261302704200810675_1487644175 | 6.883042 | 4.181097 | 2.701946 | 4.323370 | 2.559673 |
| 4863941202505455588_1494009960 | 6.685112 | 4.215508 | 2.469604 | 4.124288 | 2.560824 |
| ... | ... | ... | ... | ... | ... |
| 87109475150246226_1489667378 | 0.000000 | 0.030258 | -0.030258 | 0.047116 | -0.047116 |
| 4379775968824328227_1483537542 | 0.000000 | 0.030258 | -0.030258 | 0.006856 | -0.006856 |
| 7067296665967668087_1488199537 | 0.000000 | 0.136881 | -0.136881 | -0.410359 | 0.410359 |
| 9770415850483050649_1490394558 | 0.000000 | 0.232295 | -0.232295 | 1.620343 | -1.620343 |
| 8822111466921247316_1481795090 | 0.000000 | 0.037306 | -0.037306 | 0.636169 | -0.636169 |
1088 rows × 5 columns
Distribuciones de ambos errores
# Error distributions of both algorithms, restricted to buyers (Target > 0).
# Column positions 2 and 4 = Error_Tree and Error_Lineal.
for error_col in (comp_results_df_sample.columns[2], comp_results_df_sample.columns[4]):
    plt.figure(figsize=(15, 5))
    sns.distplot(
        comp_results_df_sample[comp_results_df_sample['Target'] > 0][error_col],
        fit=stats.norm,
    )
Tanto las métricas como las distribuciones de los errores de ambos algoritmos son muy parecidos.
Next Steps:
Vamos a hacer 10 iteraciones.
# Define the cross-validation scheme.
kf = model_selection.KFold(
    n_splits = 10,
    # k=10: 10 iterations with different random Train/Test partitions
    # --> 10 distinct datasets --> 10 distinct models
    random_state = RANDOM_STATE,  # was hard-coded 42; use the module-level seed constant (same value) for consistency
    # If shuffle=True, random_state controls the shuffling of the indices, which drives fold randomness
    # If shuffle=False, random_state has no effect
    shuffle = True  # shuffle the data before splitting it into batches
)
# Generate the train/test sample indices over the development set (df_development)
kf.split(
df_dev, # the development dataframe to split
y = None, # target array (unused by plain KFold)
groups = None # group labels for the samples used in the split
)
<generator object _BaseKFold.split at 0x000001F8E79163C0>
# Train and test index arrays produced by each of the 10 CV iterations.
for train_index, test_index in kf.split(df_dev):
    print("TRAIN:", train_index, "Samples_Train:", len(train_index), '\n')
    print("TEST:", test_index, "Samples_Test:", len(test_index))
TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 35 39 75 ... 76442 76480 76489] Samples_Test: 7650 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 4 6 54 ... 76474 76482 76495] Samples_Test: 7650 TRAIN: [ 0 1 2 ... 76496 76498 76499] Samples_Train: 68850 TEST: [ 12 32 33 ... 76453 76457 76497] Samples_Test: 7650 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 7 11 21 ... 76475 76486 76491] Samples_Test: 7650 TRAIN: [ 2 4 5 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 0 1 3 ... 76466 76487 76492] Samples_Test: 7650 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 10 43 46 ... 76443 76450 76481] Samples_Test: 7650 TRAIN: [ 0 1 2 ... 76495 76496 76497] Samples_Train: 68850 TEST: [ 8 29 63 ... 76490 76498 76499] Samples_Test: 7650 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 14 18 19 ... 76477 76478 76494] Samples_Test: 7650 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 5 15 16 ... 76476 76493 76496] Samples_Test: 7650 TRAIN: [ 0 1 3 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 2 28 55 ... 76470 76485 76488] Samples_Test: 7650
# Collect the score of each of the 10 CV iterations.
# NOTE(review): for a regressor, model.score() returns R², not classification
# accuracy, even though the printed labels call it "Acc".
scores_list = []  # one score per fold
for train_index, test_index in kf.split(df_dev):
    # Show which rows fall in this fold's train/test partition.
    print("TRAIN:", train_index, "Samples_Train:", len(train_index))
    print("TEST:", test_index, "Samples_Test:", len(test_index))
    X_train, X_test = df_dev_X.iloc[train_index], df_dev_X.iloc[test_index]
    y_train, y_test = df_dev_y.iloc[train_index], df_dev_y.iloc[test_index]
    # Same hyper-parameters chosen earlier: 7 shallow trees of depth 3.
    model = xgb.XGBRegressor(booster = 'gbtree', random_state = RANDOM_STATE, n_estimators = 7, max_depth = 3, verbosity = 1)
    model.fit(X_train, y_train)
    _score = model.score(X_test, y_test)
    scores_list.append(_score)
    print("Acc:", _score, "\n")
print("Accuracy de cada iteracción:")
scores_list  # scores of all 10 fold models
TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 35 39 75 ... 76442 76480 76489] Samples_Test: 7650 Acc: 0.21710390425892379 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 4 6 54 ... 76474 76482 76495] Samples_Test: 7650 Acc: 0.2009898312021221 TRAIN: [ 0 1 2 ... 76496 76498 76499] Samples_Train: 68850 TEST: [ 12 32 33 ... 76453 76457 76497] Samples_Test: 7650 Acc: 0.20881788037936977 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 7 11 21 ... 76475 76486 76491] Samples_Test: 7650 Acc: 0.2138669547671539 TRAIN: [ 2 4 5 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 0 1 3 ... 76466 76487 76492] Samples_Test: 7650 Acc: 0.19343804413094068 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 10 43 46 ... 76443 76450 76481] Samples_Test: 7650 Acc: 0.1482364930540393 TRAIN: [ 0 1 2 ... 76495 76496 76497] Samples_Train: 68850 TEST: [ 8 29 63 ... 76490 76498 76499] Samples_Test: 7650 Acc: 0.24235845658521926 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 14 18 19 ... 76477 76478 76494] Samples_Test: 7650 Acc: 0.1994962549089364 TRAIN: [ 0 1 2 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 5 15 16 ... 76476 76493 76496] Samples_Test: 7650 Acc: 0.22642530916338965 TRAIN: [ 0 1 3 ... 76497 76498 76499] Samples_Train: 68850 TEST: [ 2 28 55 ... 76470 76485 76488] Samples_Test: 7650 Acc: 0.25120665704575895 Accuracy de cada iteracción:
[0.21710390425892379, 0.2009898312021221, 0.20881788037936977, 0.2138669547671539, 0.19343804413094068, 0.1482364930540393, 0.24235845658521926, 0.1994962549089364, 0.22642530916338965, 0.25120665704575895]